2015 AAQoL: Machine Learning Approaches

Author

Miguel Fudolig, Luke Cho, Lawrence Kim, Boya Liu

library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)

Data set

This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.

Input data set

qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |> 
  mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
         `English Speaking`=relevel(`English Speaking`,ref="Not at all"),
         Ethnicity = relevel(Ethnicity,ref="Chinese"))
New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

There are 2,609 responses, some with missing data.

Machine Learning Classifications

We are going to analyze the prediction accuracy of different machine learning algorithms in predicting the source of health information, healthcare utilization outcomes, and insurance.

We will consider the following algorithms: - Random Forest - Logistic Regression

Source of Information: Family

ps(Family)
# A tibble: 4 × 3
  Family     n     pct
  <fct>  <int>   <dbl>
1 3          1  0.0383
2 No      1258 48.2   
3 Yes     1331 51.0   
4 <NA>      19  0.728 

Random Forest

Without Ethnicity

Training Set

rfdata <- qol |> filter(Family %in% c("No","Yes")) |> 
  mutate(Family=droplevels(Family)) |> 
  select(Family, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`)

pos<- rfdata |> filter(Family=="Yes")
neg <- rfdata |> filter(Family=="No")

set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))


train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])

randomForest::randomForest(Family~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff ,
                           data=train,
                           importance=TRUE) -> rf_wo
print(rf_wo)

Call:
 randomForest(formula = Family ~ Age + Gender + Religion + Income +      Employment + EnglishSpeak + EnglishDiff, data = train, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 42.96%
Confusion matrix:
     No Yes class.error
No  423 367   0.4645570
Yes 329 501   0.3963855
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$Family,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  718  52
       Yes  72 778
                                          
               Accuracy : 0.9235          
                 95% CI : (0.9094, 0.9359)
    No Information Rate : 0.5123          
    P-Value [Acc > NIR] : < 2e-16         
                                          
                  Kappa : 0.8467          
                                          
 Mcnemar's Test P-Value : 0.08796         
                                          
            Sensitivity : 0.9373          
            Specificity : 0.9089          
         Pos Pred Value : 0.9153          
         Neg Pred Value : 0.9325          
             Prevalence : 0.5123          
         Detection Rate : 0.4802          
   Detection Prevalence : 0.5247          
      Balanced Accuracy : 0.9231          
                                          
       'Positive' Class : Yes             
                                          

Test set

pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$Family,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  176 127
       Yes 178 224
                                          
               Accuracy : 0.5674          
                 95% CI : (0.5299, 0.6043)
    No Information Rate : 0.5021          
    P-Value [Acc > NIR] : 0.0002993       
                                          
                  Kappa : 0.1353          
                                          
 Mcnemar's Test P-Value : 0.0041966       
                                          
            Sensitivity : 0.6382          
            Specificity : 0.4972          
         Pos Pred Value : 0.5572          
         Neg Pred Value : 0.5809          
             Prevalence : 0.4979          
         Detection Rate : 0.3177          
   Detection Prevalence : 0.5702          
      Balanced Accuracy : 0.5677          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$Family,pred_noeth[,2])
Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

With Ethnicity

Training Set

set.seed(222)
ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))

train <- rfdata[ind==1,]
test <- rfdata[ind==2,]

randomForest::randomForest(Family~. ,data=train,
                           importance=TRUE) -> rf_w
print(rf_w)

Call:
 randomForest(formula = Family ~ ., data = train, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 40.56%
Confusion matrix:
     No Yes class.error
No  444 351   0.4415094
Yes 306 519   0.3709091
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$Family,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  722  31
       Yes  73 794
                                          
               Accuracy : 0.9358          
                 95% CI : (0.9227, 0.9472)
    No Information Rate : 0.5093          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.8714          
                                          
 Mcnemar's Test P-Value : 5.81e-05        
                                          
            Sensitivity : 0.9624          
            Specificity : 0.9082          
         Pos Pred Value : 0.9158          
         Neg Pred Value : 0.9588          
             Prevalence : 0.5093          
         Detection Rate : 0.4901          
   Detection Prevalence : 0.5352          
      Balanced Accuracy : 0.9353          
                                          
       'Positive' Class : Yes             
                                          

Test Set

pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$Family,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  192 140
       Yes 157 216
                                          
               Accuracy : 0.5787          
                 95% CI : (0.5413, 0.6155)
    No Information Rate : 0.505           
    P-Value [Acc > NIR] : 5.061e-05       
                                          
                  Kappa : 0.157           
                                          
 Mcnemar's Test P-Value : 0.3532          
                                          
            Sensitivity : 0.6067          
            Specificity : 0.5501          
         Pos Pred Value : 0.5791          
         Neg Pred Value : 0.5783          
             Prevalence : 0.5050          
         Detection Rate : 0.3064          
   Detection Prevalence : 0.5291          
      Balanced Accuracy : 0.5784          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$Family,pred_eth[,2])
Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))

AUROC

pROC::auc(rocobj)
Area under the curve: 0.6283
pROC::auc(rocobj_wo)
Area under the curve: 0.6175

Variable Importance

randomForest::varImpPlot(rf_w)

randomForest::importance(rf_w)
                    No        Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity     7.918278 10.8645493            15.652593         66.03128
Age          16.654016 15.7460769            26.458185        170.45644
Gender        2.073286 11.5973472            10.273489         25.95290
Religion      4.617839  8.2180796            10.526109         72.95519
Employment    5.919339  3.9165010             8.723458         21.20784
Income       12.917360  0.6092244            10.286733         99.88059
EnglishSpeak  9.143444  8.3621486            15.168532         50.20068
EnglishDiff  12.539277  4.4662574            12.990193         57.47472

Logistic Regression

No ethnicity

Training Set

mod1 <- glm(Family~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial) 

predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()

caret::confusionMatrix(pred_noeth,train$Family,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  460 346
       Yes 335 479
                                          
               Accuracy : 0.5796          
                 95% CI : (0.5552, 0.6038)
    No Information Rate : 0.5093          
    P-Value [Acc > NIR] : 7.809e-09       
                                          
                  Kappa : 0.1592          
                                          
 Mcnemar's Test P-Value : 0.7016          
                                          
            Sensitivity : 0.5806          
            Specificity : 0.5786          
         Pos Pred Value : 0.5885          
         Neg Pred Value : 0.5707          
             Prevalence : 0.5093          
         Detection Rate : 0.2957          
   Detection Prevalence : 0.5025          
      Balanced Accuracy : 0.5796          
                                          
       'Positive' Class : Yes             
                                          

Test Set

predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()

caret::confusionMatrix(pred_noeth,test$Family,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  194 156
       Yes 155 200
                                          
               Accuracy : 0.5589          
                 95% CI : (0.5213, 0.5959)
    No Information Rate : 0.505           
    P-Value [Acc > NIR] : 0.002342        
                                          
                  Kappa : 0.1177          
                                          
 Mcnemar's Test P-Value : 1.000000        
                                          
            Sensitivity : 0.5618          
            Specificity : 0.5559          
         Pos Pred Value : 0.5634          
         Neg Pred Value : 0.5543          
             Prevalence : 0.5050          
         Detection Rate : 0.2837          
   Detection Prevalence : 0.5035          
      Balanced Accuracy : 0.5588          
                                          
       'Positive' Class : Yes             
                                          

ROC

rocobj_wo <-pROC::roc(test$Family,predicted_probs)
Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

pROC::auc(rocobj_wo)
Area under the curve: 0.5862

With ethnicity

Training Set

mod1w <- glm(Family~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial) 

predict(mod1w,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()

caret::confusionMatrix(pred_noeth,train$Family,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  488 308
       Yes 307 517
                                          
               Accuracy : 0.6204          
                 95% CI : (0.5962, 0.6441)
    No Information Rate : 0.5093          
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.2405          
                                          
 Mcnemar's Test P-Value : 1               
                                          
            Sensitivity : 0.6267          
            Specificity : 0.6138          
         Pos Pred Value : 0.6274          
         Neg Pred Value : 0.6131          
             Prevalence : 0.5093          
         Detection Rate : 0.3191          
   Detection Prevalence : 0.5086          
      Balanced Accuracy : 0.6203          
                                          
       'Positive' Class : Yes             
                                          

Test Set

predict(mod1w,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()

caret::confusionMatrix(pred_noeth,test$Family,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  211 145
       Yes 138 211
                                         
               Accuracy : 0.5986         
                 95% CI : (0.5613, 0.635)
    No Information Rate : 0.505          
    P-Value [Acc > NIR] : 3.696e-07      
                                         
                  Kappa : 0.1972         
                                         
 Mcnemar's Test P-Value : 0.7213         
                                         
            Sensitivity : 0.5927         
            Specificity : 0.6046         
         Pos Pred Value : 0.6046         
         Neg Pred Value : 0.5927         
             Prevalence : 0.5050         
         Detection Rate : 0.2993         
   Detection Prevalence : 0.4950         
      Balanced Accuracy : 0.5986         
                                         
       'Positive' Class : Yes            
                                         

ROC

rocobj_w <-pROC::roc(test$Family,predicted_probs)
Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)

AUROC

pROC::auc(rocobj_w)
Area under the curve: 0.6195

Logistic Regression (ANOVA)

glm(Family~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(Family~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1w

ANOVA analysis

summary(mod1w)

Call:
glm(formula = Family ~ Age + Ethnicity + Gender + Religion + 
    Income + Employment + EnglishSpeak + EnglishDiff, family = binomial, 
    data = rfdata)

Coefficients:
                              Estimate Std. Error z value Pr(>|z|)    
(Intercept)                   1.663761   0.332233   5.008 5.51e-07 ***
Age                          -0.015172   0.002895  -5.241 1.60e-07 ***
EthnicityAsian Indian        -0.283323   0.259350  -1.092 0.274643    
EthnicityFilipino            -0.209576   0.197124  -1.063 0.287707    
EthnicityKorean              -0.619568   0.147716  -4.194 2.74e-05 ***
EthnicityOther               -0.774617   0.218603  -3.543 0.000395 ***
EthnicityVietnamese          -0.758603   0.156671  -4.842 1.29e-06 ***
GenderMale                    0.278524   0.090508   3.077 0.002089 ** 
ReligionCatholic             -0.009392   0.166826  -0.056 0.955104    
ReligionHindu                -0.255708   0.279422  -0.915 0.360122    
ReligionMuslim               -0.203595   0.335527  -0.607 0.543989    
ReligionNone                 -0.190725   0.169115  -1.128 0.259413    
ReligionOther                -0.599988   0.359218  -1.670 0.094867 .  
ReligionProtestant           -0.082695   0.169427  -0.488 0.625488    
Income$10,000 - $19,999      -0.293073   0.204786  -1.431 0.152397    
Income$20,000 - $29,999      -0.355893   0.208565  -1.706 0.087936 .  
Income$30,000 - $39,999      -0.410814   0.204680  -2.007 0.044739 *  
Income$40,000 - $49,999      -0.498557   0.216107  -2.307 0.021056 *  
Income$50,000 - $59,999      -0.262414   0.213631  -1.228 0.219315    
Income$60,000 - $69,999      -0.516755   0.209767  -2.463 0.013760 *  
Income$70,000 and over       -0.449772   0.163777  -2.746 0.006028 ** 
EmploymentEmployed full time -0.372539   0.096073  -3.878 0.000105 ***
EnglishSpeakNot well         -0.137994   0.223937  -0.616 0.537750    
EnglishSpeakVery well         0.009748   0.236859   0.041 0.967174    
EnglishSpeakWell             -0.214154   0.224791  -0.953 0.340752    
EnglishDiffMuch               0.390928   0.143654   2.721 0.006502 ** 
EnglishDiffNot much           0.016162   0.133783   0.121 0.903841    
EnglishDiffVery much         -0.245538   0.128476  -1.911 0.055984 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3222.5  on 2324  degrees of freedom
Residual deviance: 3090.4  on 2297  degrees of freedom
AIC: 3146.4

Number of Fisher Scoring iterations: 4
car::Anova(mod1w)
Analysis of Deviance Table (Type II tests)

Response: Family
             LR Chisq Df Pr(>Chisq)    
Age            27.803  1  1.343e-07 ***
Ethnicity      38.633  5  2.815e-07 ***
Gender          9.521  1  0.0020313 ** 
Religion        4.354  6  0.6289341    
Income          9.873  7  0.1958741    
Employment     15.115  1  0.0001011 ***
EnglishSpeak    3.876  3  0.2751648    
EnglishDiff    19.812  3  0.0001857 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w)
                  GVIF Df GVIF^(1/(2*Df))
Age           1.220700  1        1.104853
Ethnicity    15.005880  5        1.311071
Gender        1.118742  1        1.057706
Religion     13.250791  6        1.240281
Income        1.462715  7        1.027536
Employment    1.262994  1        1.123830
EnglishSpeak  2.419635  3        1.158666
EnglishDiff   1.827597  3        1.105724
anova(mod1,mod1w,test="LRT")
Analysis of Deviance Table

Model 1: Family ~ Age + Gender + Religion + Income + Employment + EnglishSpeak + 
    EnglishDiff
Model 2: Family ~ Age + Ethnicity + Gender + Religion + Income + Employment + 
    EnglishSpeak + EnglishDiff
  Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
1      2302     3129.0                          
2      2297     3090.4  5   38.633 2.815e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

AIC/BIC values

data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w))
    AIC_wo    AIC_w
1 3174.988 3146.356
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w))
    BIC_wo    BIC_w
1 3307.272 3307.397

Source of Information: Health Professional

ps(`Heal Professionals`)
# A tibble: 3 × 3
  `Heal Professionals`     n    pct
  <fct>                <int>  <dbl>
1 No                    1326 50.8  
2 Yes                   1264 48.4  
3 <NA>                    19  0.728

Random Forest

Without Ethnicity

Training Set

rfdata <- qol |> 
  select(`Heal Professionals`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`)

set.seed(222)
pos<- rfdata |> filter(`Heal Professionals`=="Yes")
neg <- rfdata |> filter(`Heal Professionals`=="No")

set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))


train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])

randomForest::randomForest(`Heal Professionals`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff ,
                           data=train,
                           importance=TRUE) -> rf_wo
print(rf_wo)

Call:
 randomForest(formula = `Heal Professionals` ~ Age + Gender +      Religion + Income + Employment + EnglishSpeak + EnglishDiff,      data = train, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 41.23%
Confusion matrix:
     No Yes class.error
No  428 362   0.4582278
Yes 306 524   0.3686747
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Heal Professionals`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  677  46
       Yes 113 784
                                          
               Accuracy : 0.9019          
                 95% CI : (0.8863, 0.9159)
    No Information Rate : 0.5123          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.8032          
                                          
 Mcnemar's Test P-Value : 1.658e-07       
                                          
            Sensitivity : 0.9446          
            Specificity : 0.8570          
         Pos Pred Value : 0.8740          
         Neg Pred Value : 0.9364          
             Prevalence : 0.5123          
         Detection Rate : 0.4840          
   Detection Prevalence : 0.5537          
      Balanced Accuracy : 0.9008          
                                          
       'Positive' Class : Yes             
                                          

Test set

pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Heal Professionals`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  181 126
       Yes 174 225
                                          
               Accuracy : 0.5751          
                 95% CI : (0.5376, 0.6119)
    No Information Rate : 0.5028          
    P-Value [Acc > NIR] : 6.996e-05       
                                          
                  Kappa : 0.1508          
                                          
 Mcnemar's Test P-Value : 0.006657        
                                          
            Sensitivity : 0.6410          
            Specificity : 0.5099          
         Pos Pred Value : 0.5639          
         Neg Pred Value : 0.5896          
             Prevalence : 0.4972          
         Detection Rate : 0.3187          
   Detection Prevalence : 0.5652          
      Balanced Accuracy : 0.5754          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Heal Professionals`,pred_noeth[,2])
Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

With Ethnicity

Training Set

# rfdata <- qol |>
#   select(`Heal Professionals`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
#   na.omit() |> 
#   rename(Employment=`Full Time Employment`,
#          EnglishSpeak=`English Speaking`,
#          EnglishDiff=`English Difficulties`)
# 
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
# 
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]

randomForest::randomForest(`Heal Professionals`~. ,data=train,
                           importance=TRUE) -> rf_w
print(rf_w)

Call:
 randomForest(formula = `Heal Professionals` ~ ., data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 42.1%
Confusion matrix:
     No Yes class.error
No  409 381   0.4822785
Yes 301 529   0.3626506
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Heal Professionals`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  704  43
       Yes  86 787
                                          
               Accuracy : 0.9204          
                 95% CI : (0.9061, 0.9331)
    No Information Rate : 0.5123          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.8404          
                                          
 Mcnemar's Test P-Value : 0.0002174       
                                          
            Sensitivity : 0.9482          
            Specificity : 0.8911          
         Pos Pred Value : 0.9015          
         Neg Pred Value : 0.9424          
             Prevalence : 0.5123          
         Detection Rate : 0.4858          
   Detection Prevalence : 0.5389          
      Balanced Accuracy : 0.9197          
                                          
       'Positive' Class : Yes             
                                          
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Heal Professionals`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  179 122
       Yes 176 229
                                          
               Accuracy : 0.5779          
                 95% CI : (0.5405, 0.6147)
    No Information Rate : 0.5028          
    P-Value [Acc > NIR] : 3.752e-05       
                                          
                  Kappa : 0.1565          
                                          
 Mcnemar's Test P-Value : 0.002139        
                                          
            Sensitivity : 0.6524          
            Specificity : 0.5042          
         Pos Pred Value : 0.5654          
         Neg Pred Value : 0.5947          
             Prevalence : 0.4972          
         Detection Rate : 0.3244          
   Detection Prevalence : 0.5737          
      Balanced Accuracy : 0.5783          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Heal Professionals`,pred_eth[,2])
Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))

AUROC

pROC::auc(rocobj)
Area under the curve: 0.6269
pROC::auc(rocobj_wo)
Area under the curve: 0.6207

Variable Importance

randomForest::varImpPlot(rf_w)

randomForest::importance(rf_w)
                     No         Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity     5.9860318  9.60330404            13.049013         68.63717
Age           7.0329957  4.75500036            10.080950        154.94856
Gender        5.3711218 -2.95635281             2.138476         25.31325
Religion      9.7787112 -0.02549784             7.782594         72.44490
Employment   -1.5342540 -1.77234860            -2.779661         20.60935
Income       -0.7468972 15.76511423            11.391666        100.79240
EnglishSpeak  7.2331614 19.06777183            22.152922         53.80531
EnglishDiff   3.7127127  9.91157749            10.384253         58.43659

Logistic Regression

No ethnicity

Training Set

mod1 <- glm(`Heal Professionals`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial) 

predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()

caret::confusionMatrix(pred_noeth,train$`Heal Professionals`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  484 289
       Yes 306 541
                                          
               Accuracy : 0.6327          
                 95% CI : (0.6087, 0.6562)
    No Information Rate : 0.5123          
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.2646          
                                          
 Mcnemar's Test P-Value : 0.5119          
                                          
            Sensitivity : 0.6518          
            Specificity : 0.6127          
         Pos Pred Value : 0.6387          
         Neg Pred Value : 0.6261          
             Prevalence : 0.5123          
         Detection Rate : 0.3340          
   Detection Prevalence : 0.5228          
      Balanced Accuracy : 0.6322          
                                          
       'Positive' Class : Yes             
                                          

Test Set

predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()

caret::confusionMatrix(pred_noeth,test$`Heal Professionals`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  198 123
       Yes 157 228
                                          
               Accuracy : 0.6034          
                 95% CI : (0.5662, 0.6397)
    No Information Rate : 0.5028          
    P-Value [Acc > NIR] : 5.022e-08       
                                          
                  Kappa : 0.2072          
                                          
 Mcnemar's Test P-Value : 0.0486          
                                          
            Sensitivity : 0.6496          
            Specificity : 0.5577          
         Pos Pred Value : 0.5922          
         Neg Pred Value : 0.6168          
             Prevalence : 0.4972          
         Detection Rate : 0.3229          
   Detection Prevalence : 0.5453          
      Balanced Accuracy : 0.6037          
                                          
       'Positive' Class : Yes             
                                          

ROC

rocobj_wo <-pROC::roc(test$`Heal Professionals`,predicted_probs)
Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

pROC::auc(rocobj_wo)
Area under the curve: 0.6429

With ethnicity

Training Set

mod1 <- glm(`Heal Professionals`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial) 

predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()

caret::confusionMatrix(pred_noeth,train$`Heal Professionals`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  485 294
       Yes 305 536
                                          
               Accuracy : 0.6302          
                 95% CI : (0.6062, 0.6538)
    No Information Rate : 0.5123          
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.2598          
                                          
 Mcnemar's Test P-Value : 0.6828          
                                          
            Sensitivity : 0.6458          
            Specificity : 0.6139          
         Pos Pred Value : 0.6373          
         Neg Pred Value : 0.6226          
             Prevalence : 0.5123          
         Detection Rate : 0.3309          
   Detection Prevalence : 0.5191          
      Balanced Accuracy : 0.6299          
                                          
       'Positive' Class : Yes             
                                          

Test Set

predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "No") |> as.factor()

caret::confusionMatrix(pred_noeth,test$`Heal Professionals`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  203 130
       Yes 152 221
                                          
               Accuracy : 0.6006          
                 95% CI : (0.5634, 0.6369)
    No Information Rate : 0.5028          
    P-Value [Acc > NIR] : 1.147e-07       
                                          
                  Kappa : 0.2014          
                                          
 Mcnemar's Test P-Value : 0.2111          
                                          
            Sensitivity : 0.6296          
            Specificity : 0.5718          
         Pos Pred Value : 0.5925          
         Neg Pred Value : 0.6096          
             Prevalence : 0.4972          
         Detection Rate : 0.3130          
   Detection Prevalence : 0.5283          
      Balanced Accuracy : 0.6007          
                                          
       'Positive' Class : Yes             
                                          

ROC

rocobj_w <-pROC::roc(test$`Heal Professionals`,predicted_probs)
Setting levels: control = No, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)

pROC::auc(rocobj_w)
Area under the curve: 0.6418

Logistic Regression (ANOVA)

glm(`Heal Professionals`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Heal Professionals`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1w

ANOVA analysis

summary(mod1w)

Call:
glm(formula = `Heal Professionals` ~ Age + Ethnicity + Gender + 
    Religion + Income + Employment + EnglishSpeak + EnglishDiff, 
    family = binomial, data = rfdata)

Coefficients:
                              Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -1.072165   0.340255  -3.151  0.00163 ** 
Age                           0.012777   0.002978   4.291 1.78e-05 ***
EthnicityAsian Indian        -0.341146   0.263054  -1.297  0.19468    
EthnicityFilipino             0.322119   0.207063   1.556  0.11979    
EthnicityKorean              -0.426939   0.150828  -2.831  0.00465 ** 
EthnicityOther               -0.140765   0.220803  -0.638  0.52379    
EthnicityVietnamese           0.024371   0.158357   0.154  0.87769    
GenderMale                    0.001829   0.092204   0.020  0.98417    
ReligionCatholic             -0.251763   0.169865  -1.482  0.13831    
ReligionHindu                -0.601409   0.282815  -2.127  0.03346 *  
ReligionMuslim                0.144213   0.342914   0.421  0.67408    
ReligionNone                 -0.348254   0.171327  -2.033  0.04208 *  
ReligionOther                -0.002626   0.363756  -0.007  0.99424    
ReligionProtestant           -0.288234   0.171901  -1.677  0.09359 .  
Income$10,000 - $19,999      -0.069905   0.206716  -0.338  0.73524    
Income$20,000 - $29,999       0.087219   0.208163   0.419  0.67522    
Income$30,000 - $39,999      -0.022090   0.206085  -0.107  0.91464    
Income$40,000 - $49,999      -0.003825   0.217525  -0.018  0.98597    
Income$50,000 - $59,999       0.275124   0.215593   1.276  0.20191    
Income$60,000 - $69,999       0.068545   0.212114   0.323  0.74658    
Income$70,000 and over        0.471937   0.164103   2.876  0.00403 ** 
EmploymentEmployed full time -0.007485   0.098315  -0.076  0.93931    
EnglishSpeakNot well          0.460299   0.236839   1.944  0.05195 .  
EnglishSpeakVery well         1.459027   0.250189   5.832 5.49e-09 ***
EnglishSpeakWell              1.013225   0.237330   4.269 1.96e-05 ***
EnglishDiffMuch              -0.369009   0.145860  -2.530  0.01141 *  
EnglishDiffNot much          -0.306210   0.136116  -2.250  0.02447 *  
EnglishDiffVery much         -0.332534   0.131642  -2.526  0.01154 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3224.0  on 2325  degrees of freedom
Residual deviance: 2984.2  on 2298  degrees of freedom
AIC: 3040.2

Number of Fisher Scoring iterations: 4
car::Anova(mod1w)
Analysis of Deviance Table (Type II tests)

Response: Heal Professionals
             LR Chisq Df Pr(>Chisq)    
Age            18.618  1  1.597e-05 ***
Ethnicity      18.242  5   0.002658 ** 
Gender          0.000  1   0.984173    
Religion       11.608  6   0.071317 .  
Income         22.002  7   0.002538 ** 
Employment      0.006  1   0.939307    
EnglishSpeak   55.796  3  4.644e-12 ***
EnglishDiff     9.309  3   0.025447 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w)
                  GVIF Df GVIF^(1/(2*Df))
Age           1.246279  1        1.116369
Ethnicity    14.198423  5        1.303839
Gender        1.108640  1        1.052920
Religion     12.767229  6        1.236445
Income        1.460980  7        1.027449
Employment    1.260945  1        1.122918
EnglishSpeak  2.366930  3        1.154421
EnglishDiff   1.776456  3        1.100506
anova(mod1,mod1w,test="LRT")
Analysis of Deviance Table

Model 1: `Heal Professionals` ~ Age + Gender + Religion + Income + Employment + 
    EnglishSpeak + EnglishDiff
Model 2: `Heal Professionals` ~ Age + Ethnicity + Gender + Religion + 
    Income + Employment + EnglishSpeak + EnglishDiff
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)   
1      2303     3002.5                        
2      2298     2984.2  5   18.242 0.002658 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

AIC/BIC values

data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w))
    AIC_wo    AIC_w
1 3048.475 3040.234
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w))
    BIC_wo    BIC_w
1 3180.769 3201.287

Health Insurance

ps(`Health Insurance`)
# A tibble: 3 × 3
  `Health Insurance`     n    pct
  <fct>              <int>  <dbl>
1 0                    381 14.6  
2 Yes                 2207 84.6  
3 <NA>                  21  0.805

Random Forest

Without Ethnicity

Training Set

rfdata <- qol |> 
  select(`Health Insurance`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`)

pos<- rfdata |> filter(`Health Insurance`=="Yes")
neg <- rfdata |> filter(`Health Insurance`==0)

set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))


train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])

randomForest::randomForest(`Health Insurance`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff ,
                           data=train,
                           importance=TRUE) -> rf_wo
print(rf_wo)

Call:
 randomForest(formula = `Health Insurance` ~ Age + Gender + Religion +      Income + Employment + EnglishSpeak + EnglishDiff, data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 13.58%
Confusion matrix:
     0  Yes class.error
0   16  203  0.92694064
Yes 17 1384  0.01213419
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Health Insurance`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0    132    2
       Yes   87 1399
                                          
               Accuracy : 0.9451          
                 95% CI : (0.9328, 0.9557)
    No Information Rate : 0.8648          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.719           
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.9986          
            Specificity : 0.6027          
         Pos Pred Value : 0.9415          
         Neg Pred Value : 0.9851          
             Prevalence : 0.8648          
         Detection Rate : 0.8636          
   Detection Prevalence : 0.9173          
      Balanced Accuracy : 0.8007          
                                          
       'Positive' Class : Yes             
                                          

Test set

pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Health Insurance`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0     7  16
       Yes  92 591
                                          
               Accuracy : 0.847           
                 95% CI : (0.8183, 0.8728)
    No Information Rate : 0.8598          
    P-Value [Acc > NIR] : 0.8483          
                                          
                  Kappa : 0.0653          
                                          
 Mcnemar's Test P-Value : 5.319e-13       
                                          
            Sensitivity : 0.97364         
            Specificity : 0.07071         
         Pos Pred Value : 0.86530         
         Neg Pred Value : 0.30435         
             Prevalence : 0.85977         
         Detection Rate : 0.83711         
   Detection Prevalence : 0.96742         
      Balanced Accuracy : 0.52217         
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Health Insurance`,pred_noeth[,2])
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

With Ethnicity

Training Set

# rfdata <- qol |>
#   select(`Health Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
#   na.omit() |> 
#   rename(Employment=`Full Time Employment`,
#          EnglishSpeak=`English Speaking`,
#          EnglishDiff=`English Difficulties`)
# 
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
# 
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]

randomForest::randomForest(`Health Insurance`~. ,data=train,
                           importance=TRUE) -> rf_w
print(rf_w)

Call:
 randomForest(formula = `Health Insurance` ~ ., data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 13.46%
Confusion matrix:
     0  Yes class.error
0   18  201  0.91780822
Yes 17 1384  0.01213419
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Health Insurance`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0    156    2
       Yes   63 1399
                                          
               Accuracy : 0.9599          
                 95% CI : (0.9491, 0.9689)
    No Information Rate : 0.8648          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.8056          
                                          
 Mcnemar's Test P-Value : 9.911e-14       
                                          
            Sensitivity : 0.9986          
            Specificity : 0.7123          
         Pos Pred Value : 0.9569          
         Neg Pred Value : 0.9873          
             Prevalence : 0.8648          
         Detection Rate : 0.8636          
   Detection Prevalence : 0.9025          
      Balanced Accuracy : 0.8555          
                                          
       'Positive' Class : Yes             
                                          
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Health Insurance`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0    10  12
       Yes  89 595
                                          
               Accuracy : 0.8569          
                 95% CI : (0.8289, 0.8819)
    No Information Rate : 0.8598          
    P-Value [Acc > NIR] : 0.6114          
                                          
                  Kappa : 0.1204          
                                          
 Mcnemar's Test P-Value : 3.961e-14       
                                          
            Sensitivity : 0.9802          
            Specificity : 0.1010          
         Pos Pred Value : 0.8699          
         Neg Pred Value : 0.4545          
             Prevalence : 0.8598          
         Detection Rate : 0.8428          
   Detection Prevalence : 0.9688          
      Balanced Accuracy : 0.5406          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Health Insurance`,pred_eth[,2])
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))

AUROC

pROC::auc(rocobj)
Area under the curve: 0.6666
pROC::auc(rocobj_wo)
Area under the curve: 0.6534

Variable Importance

randomForest::varImpPlot(rf_w)

randomForest::importance(rf_w)
                     0       Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity     2.400857 15.368673            16.107945         33.73518
Age           5.864815 11.132723            13.570007         72.90367
Gender       -6.129335  7.159769             3.938031         12.15898
Religion      3.565750 12.763937            13.211563         36.02166
Employment    4.416671 13.315128            14.557834         11.33611
Income       19.851050 11.218608            18.695617         61.88486
EnglishSpeak 21.295650 13.206903            20.516757         31.91132
EnglishDiff  -3.356981  9.806920             7.919856         26.51002

Logistic Regression

No ethnicity

Training Set

mod1 <- glm(`Health Insurance`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial) 

predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,train$`Health Insurance`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0     17   14
       Yes  202 1387
                                          
               Accuracy : 0.8667          
                 95% CI : (0.8491, 0.8829)
    No Information Rate : 0.8648          
    P-Value [Acc > NIR] : 0.4313          
                                          
                  Kappa : 0.106           
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.99001         
            Specificity : 0.07763         
         Pos Pred Value : 0.87288         
         Neg Pred Value : 0.54839         
             Prevalence : 0.86481         
         Detection Rate : 0.85617         
   Detection Prevalence : 0.98086         
      Balanced Accuracy : 0.53382         
                                          
       'Positive' Class : Yes             
                                          

Test Set

predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,test$`Health Insurance`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0    12   7
       Yes  87 600
                                         
               Accuracy : 0.8669         
                 95% CI : (0.8396, 0.891)
    No Information Rate : 0.8598         
    P-Value [Acc > NIR] : 0.3164         
                                         
                  Kappa : 0.1657         
                                         
 Mcnemar's Test P-Value : 3.693e-16      
                                         
            Sensitivity : 0.9885         
            Specificity : 0.1212         
         Pos Pred Value : 0.8734         
         Neg Pred Value : 0.6316         
             Prevalence : 0.8598         
         Detection Rate : 0.8499         
   Detection Prevalence : 0.9731         
      Balanced Accuracy : 0.5548         
                                         
       'Positive' Class : Yes            
                                         

ROC

rocobj_wo <-pROC::roc(test$`Health Insurance`,predicted_probs)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

pROC::auc(rocobj_wo)
Area under the curve: 0.6906

With ethnicity

Training Set

mod1 <- glm(`Health Insurance`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial) 

predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,train$`Health Insurance`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0     20   13
       Yes  199 1388
                                          
               Accuracy : 0.8691          
                 95% CI : (0.8517, 0.8852)
    No Information Rate : 0.8648          
    P-Value [Acc > NIR] : 0.3208          
                                          
                  Kappa : 0.1279          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.99072         
            Specificity : 0.09132         
         Pos Pred Value : 0.87461         
         Neg Pred Value : 0.60606         
             Prevalence : 0.86481         
         Detection Rate : 0.85679         
   Detection Prevalence : 0.97963         
      Balanced Accuracy : 0.54102         
                                          
       'Positive' Class : Yes             
                                          

Test Set

predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,test$`Health Insurance`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0     9  10
       Yes  90 597
                                          
               Accuracy : 0.8584          
                 95% CI : (0.8304, 0.8832)
    No Information Rate : 0.8598          
    P-Value [Acc > NIR] : 0.5695          
                                          
                  Kappa : 0.1125          
                                          
 Mcnemar's Test P-Value : 2.789e-15       
                                          
            Sensitivity : 0.98353         
            Specificity : 0.09091         
         Pos Pred Value : 0.86900         
         Neg Pred Value : 0.47368         
             Prevalence : 0.85977         
         Detection Rate : 0.84561         
   Detection Prevalence : 0.97309         
      Balanced Accuracy : 0.53722         
                                          
       'Positive' Class : Yes             
                                          

ROC

rocobj_w <-pROC::roc(test$`Health Insurance`,predicted_probs)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)

pROC::auc(rocobj_w)
Area under the curve: 0.6863

Logistic Regression (ANOVA)

glm(`Health Insurance`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Health Insurance`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1w

ANOVA analysis

summary(mod1w)

Call:
glm(formula = `Health Insurance` ~ Age + Ethnicity + Gender + 
    Religion + Income + Employment + EnglishSpeak + EnglishDiff, 
    family = binomial, data = rfdata)

Coefficients:
                              Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -0.759266   0.413926  -1.834  0.06661 .  
Age                           0.010922   0.004052   2.696  0.00702 ** 
EthnicityAsian Indian        -0.446479   0.408123  -1.094  0.27396    
EthnicityFilipino            -0.438511   0.317948  -1.379  0.16784    
EthnicityKorean              -0.620004   0.221913  -2.794  0.00521 ** 
EthnicityOther               -0.789804   0.310543  -2.543  0.01098 *  
EthnicityVietnamese          -0.307298   0.232873  -1.320  0.18697    
GenderMale                   -0.111812   0.134537  -0.831  0.40593    
ReligionCatholic              0.411070   0.250368   1.642  0.10062    
ReligionHindu                 0.179499   0.431971   0.416  0.67775    
ReligionMuslim               -0.074638   0.455515  -0.164  0.86985    
ReligionNone                  0.171705   0.244416   0.703  0.48236    
ReligionOther                -0.905441   0.420837  -2.152  0.03143 *  
ReligionProtestant            0.060117   0.246737   0.244  0.80750    
Income$10,000 - $19,999       0.292500   0.233643   1.252  0.21060    
Income$20,000 - $29,999       0.366639   0.246194   1.489  0.13643    
Income$30,000 - $39,999       0.561284   0.251517   2.232  0.02564 *  
Income$40,000 - $49,999       0.248941   0.262933   0.947  0.34375    
Income$50,000 - $59,999       0.480334   0.263582   1.822  0.06840 .  
Income$60,000 - $69,999       0.810553   0.282193   2.872  0.00407 ** 
Income$70,000 and over        1.698275   0.225733   7.523 5.34e-14 ***
EmploymentEmployed full time  0.746472   0.156473   4.771 1.84e-06 ***
EnglishSpeakNot well          1.427119   0.251087   5.684 1.32e-08 ***
EnglishSpeakVery well         1.842352   0.288990   6.375 1.83e-10 ***
EnglishSpeakWell              1.534621   0.257461   5.961 2.51e-09 ***
EnglishDiffMuch               0.024480   0.221104   0.111  0.91184    
EnglishDiffNot much          -0.020770   0.209162  -0.099  0.92090    
EnglishDiffVery much         -0.236057   0.205835  -1.147  0.25145    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1855.9  on 2325  degrees of freedom
Residual deviance: 1603.7  on 2298  degrees of freedom
AIC: 1659.7

Number of Fisher Scoring iterations: 5
car::Anova(mod1w)
Analysis of Deviance Table (Type II tests)

Response: Health Insurance
             LR Chisq Df Pr(>Chisq)    
Age             7.426  1   0.006428 ** 
Ethnicity      11.131  5   0.048836 *  
Gender          0.690  1   0.406267    
Religion       10.941  6   0.090209 .  
Income         78.427  7  2.881e-14 ***
Employment     23.838  1  1.048e-06 ***
EnglishSpeak   44.575  3  1.139e-09 ***
EnglishDiff     1.902  3   0.592916    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w)
                  GVIF Df GVIF^(1/(2*Df))
Age           1.322065  1        1.149811
Ethnicity    14.415734  5        1.305821
Gender        1.075038  1        1.036840
Religion     12.092494  6        1.230863
Income        1.434457  7        1.026105
Employment    1.204822  1        1.097644
EnglishSpeak  2.640364  3        1.175648
EnglishDiff   1.842258  3        1.107197
anova(mod1,mod1w,test="LRT")
Analysis of Deviance Table

Model 1: `Health Insurance` ~ Age + Gender + Religion + Income + Employment + 
    EnglishSpeak + EnglishDiff
Model 2: `Health Insurance` ~ Age + Ethnicity + Gender + Religion + Income + 
    Employment + EnglishSpeak + EnglishDiff
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
1      2303     1614.9                       
2      2298     1603.8  5   11.131  0.04884 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

AIC/BIC values

data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w))
    AIC_wo    AIC_w
1 1660.878 1659.747
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w))
    BIC_wo  BIC_w
1 1793.172 1820.8

Dental Insurance

ps(`Dental Insurance`)
# A tibble: 3 × 3
  `Dental Insurance`     n   pct
  <fct>              <int> <dbl>
1 0                   1050 40.2 
2 Yes                 1529 58.6 
3 <NA>                  30  1.15

Random Forest

Without Ethnicity

Training Set

rfdata <- qol |> 
  select(`Dental Insurance`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`)

pos<- rfdata |> filter(`Dental Insurance`=="Yes")
neg <- rfdata |> filter(`Dental Insurance`==0)

set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))


train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])

randomForest::randomForest(`Dental Insurance`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff ,
                           data=train,
                           importance=TRUE) -> rf_wo
print(rf_wo)

Call:
 randomForest(formula = `Dental Insurance` ~ Age + Gender + Religion +      Income + Employment + EnglishSpeak + EnglishDiff, data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 26.89%
Confusion matrix:
      0 Yes class.error
0   384 254   0.3981191
Yes 181 799   0.1846939
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Dental Insurance`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   543  32
       Yes  95 948
                                          
               Accuracy : 0.9215          
                 95% CI : (0.9073, 0.9341)
    No Information Rate : 0.6057          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.8328          
                                          
 Mcnemar's Test P-Value : 3.763e-08       
                                          
            Sensitivity : 0.9673          
            Specificity : 0.8511          
         Pos Pred Value : 0.9089          
         Neg Pred Value : 0.9443          
             Prevalence : 0.6057          
         Detection Rate : 0.5859          
   Detection Prevalence : 0.6446          
      Balanced Accuracy : 0.9092          
                                          
       'Positive' Class : Yes             
                                          

Test set

pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Dental Insurance`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   167  79
       Yes 110 347
                                          
               Accuracy : 0.7312          
                 95% CI : (0.6967, 0.7636)
    No Information Rate : 0.606           
    P-Value [Acc > NIR] : 2.214e-12       
                                          
                  Kappa : 0.4258          
                                          
 Mcnemar's Test P-Value : 0.0291          
                                          
            Sensitivity : 0.8146          
            Specificity : 0.6029          
         Pos Pred Value : 0.7593          
         Neg Pred Value : 0.6789          
             Prevalence : 0.6060          
         Detection Rate : 0.4936          
   Detection Prevalence : 0.6501          
      Balanced Accuracy : 0.7087          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Dental Insurance`,pred_noeth[,2])
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

With Ethnicity

Training Set

# rfdata <- qol |>
#   select(`Dental Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
#   na.omit() |> 
#   rename(Employment=`Full Time Employment`,
#          EnglishSpeak=`English Speaking`,
#          EnglishDiff=`English Difficulties`)
# 
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
# 
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]

randomForest::randomForest(`Dental Insurance`~. ,data=train,
                           importance=TRUE) -> rf_w
print(rf_w)

Call:
 randomForest(formula = `Dental Insurance` ~ ., data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 26.14%
Confusion matrix:
      0 Yes class.error
0   387 251   0.3934169
Yes 172 808   0.1755102
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Dental Insurance`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   567  19
       Yes  71 961
                                         
               Accuracy : 0.9444         
                 95% CI : (0.9321, 0.955)
    No Information Rate : 0.6057         
    P-Value [Acc > NIR] : < 2.2e-16      
                                         
                  Kappa : 0.8819         
                                         
 Mcnemar's Test P-Value : 7.621e-08      
                                         
            Sensitivity : 0.9806         
            Specificity : 0.8887         
         Pos Pred Value : 0.9312         
         Neg Pred Value : 0.9676         
             Prevalence : 0.6057         
         Detection Rate : 0.5939         
   Detection Prevalence : 0.6378         
      Balanced Accuracy : 0.9347         
                                         
       'Positive' Class : Yes            
                                         
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Dental Insurance`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   168  76
       Yes 109 350
                                          
               Accuracy : 0.7368          
                 95% CI : (0.7026, 0.7691)
    No Information Rate : 0.606           
    P-Value [Acc > NIR] : 2.077e-13       
                                          
                  Kappa : 0.4372          
                                          
 Mcnemar's Test P-Value : 0.01864         
                                          
            Sensitivity : 0.8216          
            Specificity : 0.6065          
         Pos Pred Value : 0.7625          
         Neg Pred Value : 0.6885          
             Prevalence : 0.6060          
         Detection Rate : 0.4979          
   Detection Prevalence : 0.6529          
      Balanced Accuracy : 0.7140          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Dental Insurance`,pred_eth[,2])
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))

AUROC

pROC::auc(rocobj)
Area under the curve: 0.7998
pROC::auc(rocobj_wo)
Area under the curve: 0.7967

Variable Importance

randomForest::varImpPlot(rf_w)

randomForest::importance(rf_w)
                     0       Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity     5.994484 17.831023            19.295433         63.13087
Age           7.250900 31.263847            31.737507        139.82902
Gender       -6.854679 12.278566             5.521210         22.30855
Religion     -2.362729 13.833300             9.637888         61.08582
Employment   13.831350 31.792709            37.500551         38.80522
Income       38.744005 46.182120            57.569929        159.48297
EnglishSpeak 21.815707 19.476879            32.219878         62.49798
EnglishDiff   3.099861  6.366257             7.458982         48.81062

Logistic Regression

No ethnicity

Training Set

mod1 <- glm(`Dental Insurance`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial) 

predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,train$`Dental Insurance`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   395 184
       Yes 243 796
                                          
               Accuracy : 0.7361          
                 95% CI : (0.7139, 0.7574)
    No Information Rate : 0.6057          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.4384          
                                          
 Mcnemar's Test P-Value : 0.005003        
                                          
            Sensitivity : 0.8122          
            Specificity : 0.6191          
         Pos Pred Value : 0.7661          
         Neg Pred Value : 0.6822          
             Prevalence : 0.6057          
         Detection Rate : 0.4920          
   Detection Prevalence : 0.6422          
      Balanced Accuracy : 0.7157          
                                          
       'Positive' Class : Yes             
                                          

Test Set

predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,test$`Dental Insurance`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   181  76
       Yes  96 350
                                          
               Accuracy : 0.7553          
                 95% CI : (0.7218, 0.7867)
    No Information Rate : 0.606           
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.4811          
                                          
 Mcnemar's Test P-Value : 0.1474          
                                          
            Sensitivity : 0.8216          
            Specificity : 0.6534          
         Pos Pred Value : 0.7848          
         Neg Pred Value : 0.7043          
             Prevalence : 0.6060          
         Detection Rate : 0.4979          
   Detection Prevalence : 0.6344          
      Balanced Accuracy : 0.7375          
                                          
       'Positive' Class : Yes             
                                          

ROC

rocobj_wo <-pROC::roc(test$`Dental Insurance`,predicted_probs)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

pROC::auc(rocobj_wo)
Area under the curve: 0.8144

With ethnicity

Training Set

mod1 <- glm(`Dental Insurance`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial) 

predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,train$`Dental Insurance`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   398 175
       Yes 240 805
                                          
               Accuracy : 0.7435          
                 95% CI : (0.7215, 0.7646)
    No Information Rate : 0.6057          
    P-Value [Acc > NIR] : < 2e-16         
                                          
                  Kappa : 0.4533          
                                          
 Mcnemar's Test P-Value : 0.00168         
                                          
            Sensitivity : 0.8214          
            Specificity : 0.6238          
         Pos Pred Value : 0.7703          
         Neg Pred Value : 0.6946          
             Prevalence : 0.6057          
         Detection Rate : 0.4975          
   Detection Prevalence : 0.6459          
      Balanced Accuracy : 0.7226          
                                          
       'Positive' Class : Yes             
                                          

Test Set

predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,test$`Dental Insurance`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   184  74
       Yes  93 352
                                          
               Accuracy : 0.7624          
                 95% CI : (0.7292, 0.7935)
    No Information Rate : 0.606           
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.4965          
                                          
 Mcnemar's Test P-Value : 0.1637          
                                          
            Sensitivity : 0.8263          
            Specificity : 0.6643          
         Pos Pred Value : 0.7910          
         Neg Pred Value : 0.7132          
             Prevalence : 0.6060          
         Detection Rate : 0.5007          
   Detection Prevalence : 0.6330          
      Balanced Accuracy : 0.7453          
                                          
       'Positive' Class : Yes             
                                          

ROC

rocobj_w <-pROC::roc(test$`Dental Insurance`,predicted_probs)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)

AUROC

pROC::auc(rocobj_w)
Area under the curve: 0.815
pROC::auc(rocobj_wo)
Area under the curve: 0.8144

Logistic Regression (ANOVA)

glm(`Dental Insurance`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Dental Insurance`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1w

ANOVA analysis

summary(mod1w)

Call:
glm(formula = `Dental Insurance` ~ Age + Ethnicity + Gender + 
    Religion + Income + Employment + EnglishSpeak + EnglishDiff, 
    family = binomial, data = rfdata)

Coefficients:
                              Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -1.036310   0.382169  -2.712 0.006695 ** 
Age                           0.001404   0.003302   0.425 0.670765    
EthnicityAsian Indian         0.287705   0.332272   0.866 0.386561    
EthnicityFilipino             0.388713   0.243110   1.599 0.109839    
EthnicityKorean              -0.718523   0.169016  -4.251 2.13e-05 ***
EthnicityOther               -0.014080   0.253570  -0.056 0.955719    
EthnicityVietnamese          -0.043980   0.178087  -0.247 0.804940    
GenderMale                   -0.404963   0.106236  -3.812 0.000138 ***
ReligionCatholic              0.063762   0.191971   0.332 0.739781    
ReligionHindu                -0.867447   0.352066  -2.464 0.013744 *  
ReligionMuslim               -0.991203   0.397838  -2.491 0.012721 *  
ReligionNone                 -0.177425   0.193299  -0.918 0.358682    
ReligionOther                -0.466679   0.409932  -1.138 0.254941    
ReligionProtestant           -0.225351   0.193424  -1.165 0.243993    
Income$10,000 - $19,999      -0.593746   0.226870  -2.617 0.008868 ** 
Income$20,000 - $29,999      -0.595717   0.222689  -2.675 0.007470 ** 
Income$30,000 - $39,999      -0.067002   0.216460  -0.310 0.756915    
Income$40,000 - $49,999       0.120388   0.226426   0.532 0.594942    
Income$50,000 - $59,999       0.434508   0.225649   1.926 0.054156 .  
Income$60,000 - $69,999       0.819607   0.225122   3.641 0.000272 ***
Income$70,000 and over        1.572526   0.177728   8.848  < 2e-16 ***
EmploymentEmployed full time  0.971032   0.111288   8.725  < 2e-16 ***
EnglishSpeakNot well          0.831868   0.271836   3.060 0.002212 ** 
EnglishSpeakVery well         1.560834   0.286206   5.454 4.94e-08 ***
EnglishSpeakWell              1.048865   0.272039   3.856 0.000115 ***
EnglishDiffMuch              -0.364049   0.168306  -2.163 0.030540 *  
EnglishDiffNot much          -0.152249   0.158931  -0.958 0.338085    
EnglishDiffVery much         -0.366967   0.157262  -2.333 0.019623 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3112.9  on 2320  degrees of freedom
Residual deviance: 2405.0  on 2293  degrees of freedom
AIC: 2461

Number of Fisher Scoring iterations: 4
car::Anova(mod1w)
Analysis of Deviance Table (Type II tests)

Response: Dental Insurance
             LR Chisq Df Pr(>Chisq)    
Age             0.181  1  0.6707549    
Ethnicity      32.224  5  5.364e-06 ***
Gender         14.717  1  0.0001249 ***
Religion       11.138  6  0.0842022 .  
Income        253.343  7  < 2.2e-16 ***
Employment     77.918  1  < 2.2e-16 ***
EnglishSpeak   36.779  3  5.125e-08 ***
EnglishDiff     7.928  3  0.0475285 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w)
                  GVIF Df GVIF^(1/(2*Df))
Age           1.216428  1        1.102918
Ethnicity    16.165895  5        1.320870
Gender        1.117911  1        1.057313
Religion     14.529225  6        1.249837
Income        1.399740  7        1.024311
Employment    1.194814  1        1.093075
EnglishSpeak  2.216819  3        1.141883
EnglishDiff   1.814683  3        1.104418
anova(mod1,mod1w,test="LRT")
Analysis of Deviance Table

Model 1: `Dental Insurance` ~ Age + Gender + Religion + Income + Employment + 
    EnglishSpeak + EnglishDiff
Model 2: `Dental Insurance` ~ Age + Ethnicity + Gender + Religion + Income + 
    Employment + EnglishSpeak + EnglishDiff
  Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
1      2298     2437.2                          
2      2293     2404.9  5   32.224 5.364e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

AIC/BIC values

data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w))
    AIC_wo    AIC_w
1 2483.177 2460.953
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w))
    BIC_wo    BIC_w
1 2615.421 2621.946

Urgent Care utilization in the past 12 months

ps(`Urgentcare`)
# A tibble: 3 × 3
  Urgentcare     n   pct
  <fct>      <int> <dbl>
1 0           2112 81.0 
2 Yes          440 16.9 
3 <NA>          57  2.18

Random Forest

Without Ethnicity

Training Set

rfdata <- qol |> 
  select(`Urgentcare`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`)

pos<- rfdata |> filter(`Urgentcare`=="Yes")
neg <- rfdata |> filter(`Urgentcare`==0)

set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))


train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])

randomForest::randomForest(`Urgentcare`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff ,
                           data=train,
                           importance=TRUE) -> rf_wo
print(rf_wo)

Call:
 randomForest(formula = Urgentcare ~ Age + Gender + Religion +      Income + Employment + EnglishSpeak + EnglishDiff, data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 17.66%
Confusion matrix:
       0 Yes class.error
0   1306  14  0.01060606
Yes  268   9  0.96750903
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Urgentcare`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0   1320  174
       Yes    0  103
                                          
               Accuracy : 0.891           
                 95% CI : (0.8747, 0.9059)
    No Information Rate : 0.8265          
    P-Value [Acc > NIR] : 3.815e-13       
                                          
                  Kappa : 0.4946          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.3718          
            Specificity : 1.0000          
         Pos Pred Value : 1.0000          
         Neg Pred Value : 0.8835          
             Prevalence : 0.1735          
         Detection Rate : 0.0645          
   Detection Prevalence : 0.0645          
      Balanced Accuracy : 0.6859          
                                          
       'Positive' Class : Yes             
                                          

Test set

pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Urgentcare`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   578 117
       Yes   3   2
                                          
               Accuracy : 0.8286          
                 95% CI : (0.7986, 0.8558)
    No Information Rate : 0.83            
    P-Value [Acc > NIR] : 0.5642          
                                          
                  Kappa : 0.0188          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.016807        
            Specificity : 0.994836        
         Pos Pred Value : 0.400000        
         Neg Pred Value : 0.831655        
             Prevalence : 0.170000        
         Detection Rate : 0.002857        
   Detection Prevalence : 0.007143        
      Balanced Accuracy : 0.505822        
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Urgentcare`,pred_noeth[,2])
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

With Ethnicity

Training Set

# rfdata <- qol |>
#   select(`Urgentcare`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
#   na.omit() |> 
#   rename(Employment=`Full Time Employment`,
#          EnglishSpeak=`English Speaking`,
#          EnglishDiff=`English Difficulties`)
# 
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
# 
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]

randomForest::randomForest(`Urgentcare`~. ,data=train,
                           importance=TRUE) -> rf_w
print(rf_w)

Call:
 randomForest(formula = Urgentcare ~ ., data = train, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 17.66%
Confusion matrix:
       0 Yes class.error
0   1309  11 0.008333333
Yes  271   6 0.978339350
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Urgentcare`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0   1320  146
       Yes    0  131
                                          
               Accuracy : 0.9086          
                 95% CI : (0.8934, 0.9223)
    No Information Rate : 0.8265          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.5973          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.47292         
            Specificity : 1.00000         
         Pos Pred Value : 1.00000         
         Neg Pred Value : 0.90041         
             Prevalence : 0.17345         
         Detection Rate : 0.08203         
   Detection Prevalence : 0.08203         
      Balanced Accuracy : 0.73646         
                                          
       'Positive' Class : Yes             
                                          
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Urgentcare`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   577 117
       Yes   4   2
                                          
               Accuracy : 0.8271          
                 95% CI : (0.7971, 0.8544)
    No Information Rate : 0.83            
    P-Value [Acc > NIR] : 0.6033          
                                          
                  Kappa : 0.0159          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.016807        
            Specificity : 0.993115        
         Pos Pred Value : 0.333333        
         Neg Pred Value : 0.831412        
             Prevalence : 0.170000        
         Detection Rate : 0.002857        
   Detection Prevalence : 0.008571        
      Balanced Accuracy : 0.504961        
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Urgentcare`,pred_eth[,2])
Setting levels: control = 0, case = Yes
Setting direction: controls > cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))

AUROC

pROC::auc(rocobj)
Area under the curve: 0.4858
pROC::auc(rocobj_wo)
Area under the curve: 0.5102

Variable Importance

randomForest::varImpPlot(rf_w)

randomForest::importance(rf_w)
                     0        Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity    10.686415 -6.1763833            8.5003474         35.33908
Age           4.496937  7.7223094            7.6703428        100.89055
Gender       -0.119335  2.2184180            0.8534296         14.10049
Religion     12.831846 -2.7225625           12.0576909         36.31904
Employment    5.839689 -3.5613963            4.4406385         12.21524
Income        3.736587  2.4811992            4.5254163         53.60340
EnglishSpeak 11.614478  0.9775154           11.5592823         25.38791
EnglishDiff   9.580822 -2.3648761            8.3388440         30.57526

Urgent care IS NOT really predicted well. Might as well just say “yes” to everything since accuracy is not significantly different from NIR.

Logistic Regression

No ethnicity

Training Set

mod1 <- glm(`Urgentcare`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial) 

predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,train$`Urgentcare`,positive="Yes")
Warning in confusionMatrix.default(pred_noeth, train$Urgentcare, positive =
"Yes"): Levels are not in the same order for reference and data. Refactoring
data to match.
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0   1320  277
       Yes    0    0
                                          
               Accuracy : 0.8265          
                 95% CI : (0.8071, 0.8448)
    No Information Rate : 0.8265          
    P-Value [Acc > NIR] : 0.516           
                                          
                  Kappa : 0               
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.0000          
            Specificity : 1.0000          
         Pos Pred Value :    NaN          
         Neg Pred Value : 0.8265          
             Prevalence : 0.1735          
         Detection Rate : 0.0000          
   Detection Prevalence : 0.0000          
      Balanced Accuracy : 0.5000          
                                          
       'Positive' Class : Yes             
                                          

Test Set

predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,test$`Urgentcare`,positive="Yes")
Warning in confusionMatrix.default(pred_noeth, test$Urgentcare, positive =
"Yes"): Levels are not in the same order for reference and data. Refactoring
data to match.
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   581 119
       Yes   0   0
                                          
               Accuracy : 0.83            
                 95% CI : (0.8001, 0.8571)
    No Information Rate : 0.83            
    P-Value [Acc > NIR] : 0.5245          
                                          
                  Kappa : 0               
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.00            
            Specificity : 1.00            
         Pos Pred Value :  NaN            
         Neg Pred Value : 0.83            
             Prevalence : 0.17            
         Detection Rate : 0.00            
   Detection Prevalence : 0.00            
      Balanced Accuracy : 0.50            
                                          
       'Positive' Class : Yes             
                                          

ROC

rocobj_wo <-pROC::roc(test$`Urgentcare`,predicted_probs)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

pROC::auc(rocobj_wo)
Area under the curve: 0.5547

With ethnicity

Training Set

mod1 <- glm(`Urgentcare`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial) 

predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,train$`Urgentcare`,positive="Yes")
Warning in confusionMatrix.default(pred_noeth, train$Urgentcare, positive =
"Yes"): Levels are not in the same order for reference and data. Refactoring
data to match.
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0   1320  277
       Yes    0    0
                                          
               Accuracy : 0.8265          
                 95% CI : (0.8071, 0.8448)
    No Information Rate : 0.8265          
    P-Value [Acc > NIR] : 0.516           
                                          
                  Kappa : 0               
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.0000          
            Specificity : 1.0000          
         Pos Pred Value :    NaN          
         Neg Pred Value : 0.8265          
             Prevalence : 0.1735          
         Detection Rate : 0.0000          
   Detection Prevalence : 0.0000          
      Balanced Accuracy : 0.5000          
                                          
       'Positive' Class : Yes             
                                          

Test Set

predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,test$`Urgentcare`,positive="Yes")
Warning in confusionMatrix.default(pred_noeth, test$Urgentcare, positive =
"Yes"): Levels are not in the same order for reference and data. Refactoring
data to match.
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   581 119
       Yes   0   0
                                          
               Accuracy : 0.83            
                 95% CI : (0.8001, 0.8571)
    No Information Rate : 0.83            
    P-Value [Acc > NIR] : 0.5245          
                                          
                  Kappa : 0               
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.00            
            Specificity : 1.00            
         Pos Pred Value :  NaN            
         Neg Pred Value : 0.83            
             Prevalence : 0.17            
         Detection Rate : 0.00            
   Detection Prevalence : 0.00            
      Balanced Accuracy : 0.50            
                                          
       'Positive' Class : Yes             
                                          

ROC

rocobj_w <-pROC::roc(test$`Urgentcare`,predicted_probs)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)

pROC::auc(rocobj_w)
Area under the curve: 0.5521

Logistic Regression (ANOVA)

glm(`Urgentcare`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Urgentcare`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1w

ANOVA analysis

summary(mod1w)

Call:
glm(formula = Urgentcare ~ Age + Ethnicity + Gender + Religion + 
    Income + Employment + EnglishSpeak + EnglishDiff, family = binomial, 
    data = rfdata)

Coefficients:
                              Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -2.825864   0.461983  -6.117 9.55e-10 ***
Age                           0.010671   0.003744   2.850  0.00437 ** 
EthnicityAsian Indian         0.074110   0.353655   0.210  0.83402    
EthnicityFilipino             0.285745   0.250026   1.143  0.25310    
EthnicityKorean               0.342753   0.201449   1.701  0.08886 .  
EthnicityOther                0.288352   0.284633   1.013  0.31103    
EthnicityVietnamese           0.433903   0.207332   2.093  0.03637 *  
GenderMale                    0.071345   0.117536   0.607  0.54385    
ReligionCatholic              0.212812   0.204803   1.039  0.29875    
ReligionHindu                 0.114642   0.370370   0.310  0.75692    
ReligionMuslim               -0.414343   0.488870  -0.848  0.39669    
ReligionNone                 -0.225688   0.227351  -0.993  0.32086    
ReligionOther                -0.084914   0.453352  -0.187  0.85142    
ReligionProtestant           -0.061254   0.219467  -0.279  0.78017    
Income$10,000 - $19,999       0.168237   0.278113   0.605  0.54523    
Income$20,000 - $29,999       0.327745   0.271081   1.209  0.22665    
Income$30,000 - $39,999      -0.067060   0.283435  -0.237  0.81297    
Income$40,000 - $49,999       0.256103   0.281591   0.909  0.36309    
Income$50,000 - $59,999       0.141910   0.287804   0.493  0.62196    
Income$60,000 - $69,999       0.287057   0.276183   1.039  0.29863    
Income$70,000 and over        0.144532   0.223021   0.648  0.51694    
EmploymentEmployed full time  0.024963   0.124338   0.201  0.84088    
EnglishSpeakNot well          0.121938   0.333446   0.366  0.71460    
EnglishSpeakVery well         0.591279   0.345074   1.713  0.08662 .  
EnglishSpeakWell              0.431813   0.331860   1.301  0.19319    
EnglishDiffMuch               0.064030   0.183437   0.349  0.72705    
EnglishDiffNot much           0.047657   0.171562   0.278  0.78118    
EnglishDiffVery much         -0.284500   0.173443  -1.640  0.10094    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2111.7  on 2296  degrees of freedom
Residual deviance: 2064.6  on 2269  degrees of freedom
AIC: 2120.6

Number of Fisher Scoring iterations: 4
car::Anova(mod1w)
Analysis of Deviance Table (Type II tests)

Response: Urgentcare
             LR Chisq Df Pr(>Chisq)   
Age            8.0508  1   0.004548 **
Ethnicity      5.5492  5   0.352578   
Gender         0.3682  1   0.543991   
Religion       5.8991  6   0.434584   
Income         3.4588  7   0.839573   
Employment     0.0403  1   0.840883   
EnglishSpeak   6.6399  3   0.084306 . 
EnglishDiff    4.1245  3   0.248332   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w)
                  GVIF Df GVIF^(1/(2*Df))
Age           1.198076  1        1.094567
Ethnicity    14.821139  5        1.309448
Gender        1.105467  1        1.051412
Religion     13.102885  6        1.239122
Income        1.456945  7        1.027246
Employment    1.240089  1        1.113593
EnglishSpeak  2.319308  3        1.150517
EnglishDiff   1.800663  3        1.102991
anova(mod1,mod1w,test="LRT")
Analysis of Deviance Table

Model 1: Urgentcare ~ Age + Gender + Religion + Income + Employment + 
    EnglishSpeak + EnglishDiff
Model 2: Urgentcare ~ Age + Ethnicity + Gender + Religion + Income + Employment + 
    EnglishSpeak + EnglishDiff
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1      2274     2070.2                     
2      2269     2064.6  5   5.5492   0.3526

AIC/BIC values

data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w))
    AIC_wo   AIC_w
1 2116.189 2120.64
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w))
    BIC_wo    BIC_w
1 2248.195 2281.342

Physical Checkup

ps(`Physical Check-up`)
# A tibble: 3 × 3
  `Physical Check-up`     n   pct
  <fct>               <int> <dbl>
1 0                     833 31.9 
2 Yes                  1740 66.7 
3 <NA>                   36  1.38

Random Forest

Without Ethnicity

Training Set

rfdata <- qol |> 
  select(`Physical Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`)

pos<- rfdata |> filter(`Physical Check-up`=="Yes")
neg <- rfdata |> filter(`Physical Check-up`==0)

set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))


train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])

randomForest::randomForest(`Physical Check-up`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff ,
                           data=train,
                           importance=TRUE) -> rf_wo
print(rf_wo)

Call:
 randomForest(formula = `Physical Check-up` ~ Age + Gender + Religion +      Income + Employment + EnglishSpeak + EnglishDiff, data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 30.7%
Confusion matrix:
      0 Yes class.error
0   154 365   0.7032755
Yes 129 961   0.1183486
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Physical Check-up`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0    385    5
       Yes  134 1085
                                          
               Accuracy : 0.9136          
                 95% CI : (0.8988, 0.9269)
    No Information Rate : 0.6774          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.7886          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.9954          
            Specificity : 0.7418          
         Pos Pred Value : 0.8901          
         Neg Pred Value : 0.9872          
             Prevalence : 0.6774          
         Detection Rate : 0.6743          
   Detection Prevalence : 0.7576          
      Balanced Accuracy : 0.8686          
                                          
       'Positive' Class : Yes             
                                          

Test set

pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Physical Check-up`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0    60  67
       Yes 170 406
                                          
               Accuracy : 0.6629          
                 95% CI : (0.6266, 0.6978)
    No Information Rate : 0.6728          
    P-Value [Acc > NIR] : 0.7277          
                                          
                  Kappa : 0.1347          
                                          
 Mcnemar's Test P-Value : 3.458e-11       
                                          
            Sensitivity : 0.8584          
            Specificity : 0.2609          
         Pos Pred Value : 0.7049          
         Neg Pred Value : 0.4724          
             Prevalence : 0.6728          
         Detection Rate : 0.5775          
   Detection Prevalence : 0.8193          
      Balanced Accuracy : 0.5596          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Physical Check-up`,pred_noeth[,2])
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

With Ethnicity

Training Set

# rfdata <- qol |>
#   select(`Physical Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
#   na.omit() |> 
#   rename(Employment=`Full Time Employment`,
#          EnglishSpeak=`English Speaking`,
#          EnglishDiff=`English Difficulties`)
# 
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
# 
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]

randomForest::randomForest(`Physical Check-up`~. ,data=train,
                           importance=TRUE) -> rf_w
print(rf_w)

Call:
 randomForest(formula = `Physical Check-up` ~ ., data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 30.39%
Confusion matrix:
      0 Yes class.error
0   163 356   0.6859345
Yes 133 957   0.1220183
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Physical Check-up`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0    409    4
       Yes  110 1086
                                          
               Accuracy : 0.9291          
                 95% CI : (0.9155, 0.9412)
    No Information Rate : 0.6774          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.8287          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.9963          
            Specificity : 0.7881          
         Pos Pred Value : 0.9080          
         Neg Pred Value : 0.9903          
             Prevalence : 0.6774          
         Detection Rate : 0.6750          
   Detection Prevalence : 0.7433          
      Balanced Accuracy : 0.8922          
                                          
       'Positive' Class : Yes             
                                          
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Physical Check-up`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0    61  65
       Yes 169 408
                                          
               Accuracy : 0.6671          
                 95% CI : (0.6309, 0.7019)
    No Information Rate : 0.6728          
    P-Value [Acc > NIR] : 0.6427          
                                          
                  Kappa : 0.1446          
                                          
 Mcnemar's Test P-Value : 1.658e-11       
                                          
            Sensitivity : 0.8626          
            Specificity : 0.2652          
         Pos Pred Value : 0.7071          
         Neg Pred Value : 0.4841          
             Prevalence : 0.6728          
         Detection Rate : 0.5804          
   Detection Prevalence : 0.8208          
      Balanced Accuracy : 0.5639          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Physical Check-up`,pred_eth[,2])
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))

AUROC

pROC::auc(rocobj)
Area under the curve: 0.6691
pROC::auc(rocobj_wo)
Area under the curve: 0.647

Variable Importance

randomForest::varImpPlot(rf_w)

randomForest::importance(rf_w)
                      0      Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity    -2.1405730 12.25127            10.297182         58.32849
Age           5.5814758 28.76194            27.503520        148.18282
Gender       -1.4156418 12.00199             8.760002         23.49004
Religion     -7.6574489 14.90969             8.434901         63.27393
Employment   -8.4761204 19.80178            15.557200         20.80459
Income        1.7335994 26.88213            23.456748         96.62966
EnglishSpeak -3.6512705 17.67902            15.079959         43.68058
EnglishDiff   0.9252643 12.65443            11.215809         49.64512

No change between accuracy and no information rate on the test set.

Logistic Regression

No ethnicity

Training Set

mod1 <- glm(`Physical Check-up`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial) 

predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,train$`Physical Check-up`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   183 110
       Yes 336 980
                                          
               Accuracy : 0.7228          
                 95% CI : (0.7002, 0.7446)
    No Information Rate : 0.6774          
    P-Value [Acc > NIR] : 4.52e-05        
                                          
                  Kappa : 0.2841          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.8991          
            Specificity : 0.3526          
         Pos Pred Value : 0.7447          
         Neg Pred Value : 0.6246          
             Prevalence : 0.6774          
         Detection Rate : 0.6091          
   Detection Prevalence : 0.8179          
      Balanced Accuracy : 0.6258          
                                          
       'Positive' Class : Yes             
                                          

Test Set

predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,test$`Physical Check-up`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0    60  65
       Yes 170 408
                                          
               Accuracy : 0.6657          
                 95% CI : (0.6295, 0.7005)
    No Information Rate : 0.6728          
    P-Value [Acc > NIR] : 0.6721          
                                          
                  Kappa : 0.1398          
                                          
 Mcnemar's Test P-Value : 1.167e-11       
                                          
            Sensitivity : 0.8626          
            Specificity : 0.2609          
         Pos Pred Value : 0.7059          
         Neg Pred Value : 0.4800          
             Prevalence : 0.6728          
         Detection Rate : 0.5804          
   Detection Prevalence : 0.8222          
      Balanced Accuracy : 0.5617          
                                          
       'Positive' Class : Yes             
                                          

ROC

rocobj_wo <-pROC::roc(test$`Physical Check-up`,predicted_probs)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

pROC::auc(rocobj_wo)
Area under the curve: 0.6736

With ethnicity

Training Set

mod1 <- glm(`Physical Check-up`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial) 

predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,train$`Physical Check-up`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   191 116
       Yes 328 974
                                          
               Accuracy : 0.7241          
                 95% CI : (0.7015, 0.7458)
    No Information Rate : 0.6774          
    P-Value [Acc > NIR] : 2.852e-05       
                                          
                  Kappa : 0.2929          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.8936          
            Specificity : 0.3680          
         Pos Pred Value : 0.7481          
         Neg Pred Value : 0.6221          
             Prevalence : 0.6774          
         Detection Rate : 0.6053          
   Detection Prevalence : 0.8092          
      Balanced Accuracy : 0.6308          
                                          
       'Positive' Class : Yes             
                                          

Test Set

predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,test$`Physical Check-up`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0    72  64
       Yes 158 409
                                          
               Accuracy : 0.6842          
                 95% CI : (0.6484, 0.7184)
    No Information Rate : 0.6728          
    P-Value [Acc > NIR] : 0.2743          
                                          
                  Kappa : 0.1986          
                                          
 Mcnemar's Test P-Value : 4.327e-10       
                                          
            Sensitivity : 0.8647          
            Specificity : 0.3130          
         Pos Pred Value : 0.7213          
         Neg Pred Value : 0.5294          
             Prevalence : 0.6728          
         Detection Rate : 0.5818          
   Detection Prevalence : 0.8065          
      Balanced Accuracy : 0.5889          
                                          
       'Positive' Class : Yes             
                                          

ROC

rocobj_w <-pROC::roc(test$`Physical Check-up`,predicted_probs)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)

AUROC

pROC::auc(rocobj_w)
Area under the curve: 0.6884
pROC::auc(rocobj_wo)
Area under the curve: 0.6736

Logistic Regression (ANOVA)

glm(`Physical Check-up`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Physical Check-up`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1w

ANOVA analysis

summary(mod1w)

Call:
glm(formula = `Physical Check-up` ~ Age + Ethnicity + Gender + 
    Religion + Income + Employment + EnglishSpeak + EnglishDiff, 
    family = binomial, data = rfdata)

Coefficients:
                              Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -1.670919   0.360385  -4.636 3.54e-06 ***
Age                           0.036470   0.003463  10.531  < 2e-16 ***
EthnicityAsian Indian        -0.429756   0.284941  -1.508 0.131496    
EthnicityFilipino            -0.070381   0.232562  -0.303 0.762171    
EthnicityKorean              -0.696159   0.163307  -4.263 2.02e-05 ***
EthnicityOther               -0.497847   0.238551  -2.087 0.036892 *  
EthnicityVietnamese           0.310191   0.179550   1.728 0.084060 .  
GenderMale                   -0.691064   0.102474  -6.744 1.54e-11 ***
ReligionCatholic              0.277402   0.194246   1.428 0.153264    
ReligionHindu                 0.313433   0.306855   1.021 0.307049    
ReligionMuslim                0.380113   0.369009   1.030 0.302968    
ReligionNone                  0.196886   0.189899   1.037 0.299833    
ReligionOther                -0.108269   0.371725  -0.291 0.770851    
ReligionProtestant            0.072264   0.191630   0.377 0.706099    
Income$10,000 - $19,999      -0.140279   0.213287  -0.658 0.510730    
Income$20,000 - $29,999      -0.059952   0.216651  -0.277 0.781993    
Income$30,000 - $39,999       0.223262   0.216758   1.030 0.303008    
Income$40,000 - $49,999       0.022214   0.228213   0.097 0.922458    
Income$50,000 - $59,999       0.377589   0.227892   1.657 0.097545 .  
Income$60,000 - $69,999       0.184929   0.221144   0.836 0.403021    
Income$70,000 and over        0.917047   0.175370   5.229 1.70e-07 ***
EmploymentEmployed full time  0.190892   0.109115   1.749 0.080211 .  
EnglishSpeakNot well          0.880715   0.234409   3.757 0.000172 ***
EnglishSpeakVery well         1.395289   0.254386   5.485 4.14e-08 ***
EnglishSpeakWell              1.205357   0.238229   5.060 4.20e-07 ***
EnglishDiffMuch              -0.264243   0.165476  -1.597 0.110296    
EnglishDiffNot much          -0.558953   0.153838  -3.633 0.000280 ***
EnglishDiffVery much         -0.442620   0.147776  -2.995 0.002743 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2912.3  on 2311  degrees of freedom
Residual deviance: 2573.3  on 2284  degrees of freedom
AIC: 2629.3

Number of Fisher Scoring iterations: 4
car::Anova(mod1w)
Analysis of Deviance Table (Type II tests)

Response: Physical Check-up
             LR Chisq Df Pr(>Chisq)    
Age           123.064  1  < 2.2e-16 ***
Ethnicity      38.328  5  3.242e-07 ***
Gender         46.509  1  9.122e-12 ***
Religion        4.232  6  0.6452799    
Income         65.406  7  1.246e-11 ***
Employment      3.063  1  0.0800877 .  
EnglishSpeak   32.444  3  4.220e-07 ***
EnglishDiff    17.111  3  0.0006705 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w)
                  GVIF Df GVIF^(1/(2*Df))
Age           1.328999  1        1.152822
Ethnicity    14.401679  5        1.305694
Gender        1.141211  1        1.068275
Religion     12.567268  6        1.234819
Income        1.493873  7        1.029084
Employment    1.280847  1        1.131745
EnglishSpeak  2.581069  3        1.171206
EnglishDiff   1.846083  3        1.107580
anova(mod1,mod1w,test="LRT")
Analysis of Deviance Table

Model 1: `Physical Check-up` ~ Age + Gender + Religion + Income + Employment + 
    EnglishSpeak + EnglishDiff
Model 2: `Physical Check-up` ~ Age + Ethnicity + Gender + Religion + Income + 
    Employment + EnglishSpeak + EnglishDiff
  Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
1      2289     2611.6                          
2      2284     2573.3  5   38.328 3.242e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

AIC/BIC values

data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w))
    AIC_wo    AIC_w
1 2657.606 2629.278
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w))
    BIC_wo    BIC_w
1 2789.761 2790.162

Dental Checkup

ps(`Dentist Check-up`)
# A tibble: 3 × 3
  `Dentist Check-up`     n   pct
  <fct>              <int> <dbl>
1 0                   1100 42.2 
2 Yes                 1462 56.0 
3 <NA>                  47  1.80

Random Forest

Without Ethnicity

Training Set

rfdata <- qol |> 
  select(`Dentist Check-up`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`)

pos<- rfdata |> filter(`Dentist Check-up`=="Yes")
neg <- rfdata |> filter(`Dentist Check-up`==0)

set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))


train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])

randomForest::randomForest(`Dentist Check-up`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff ,
                           data=train,
                           importance=TRUE) -> rf_wo
print(rf_wo)

Call:
 randomForest(formula = `Dentist Check-up` ~ Age + Gender + Religion +      Income + Employment + EnglishSpeak + EnglishDiff, data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 32.75%
Confusion matrix:
      0 Yes class.error
0   352 311   0.4690799
Yes 214 726   0.2276596
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Dentist Check-up`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   567  44
       Yes  96 896
                                         
               Accuracy : 0.9127         
                 95% CI : (0.8978, 0.926)
    No Information Rate : 0.5864         
    P-Value [Acc > NIR] : < 2.2e-16      
                                         
                  Kappa : 0.8178         
                                         
 Mcnemar's Test P-Value : 1.63e-05       
                                         
            Sensitivity : 0.9532         
            Specificity : 0.8552         
         Pos Pred Value : 0.9032         
         Neg Pred Value : 0.9280         
             Prevalence : 0.5864         
         Detection Rate : 0.5590         
   Detection Prevalence : 0.6188         
      Balanced Accuracy : 0.9042         
                                         
       'Positive' Class : Yes            
                                         

Test set

pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Dentist Check-up`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   145  99
       Yes 149 309
                                          
               Accuracy : 0.6467          
                 95% CI : (0.6101, 0.6821)
    No Information Rate : 0.5812          
    P-Value [Acc > NIR] : 0.0002248       
                                          
                  Kappa : 0.2566          
                                          
 Mcnemar's Test P-Value : 0.0018614       
                                          
            Sensitivity : 0.7574          
            Specificity : 0.4932          
         Pos Pred Value : 0.6747          
         Neg Pred Value : 0.5943          
             Prevalence : 0.5812          
         Detection Rate : 0.4402          
   Detection Prevalence : 0.6524          
      Balanced Accuracy : 0.6253          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Dentist Check-up`,pred_noeth[,2])
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

With Ethnicity

Training Set

# rfdata <- qol |>
#   select(`Dentist Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
#   na.omit() |> 
#   rename(Employment=`Full Time Employment`,
#          EnglishSpeak=`English Speaking`,
#          EnglishDiff=`English Difficulties`)
# 
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
# 
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]

randomForest::randomForest(`Dentist Check-up`~. ,data=train,
                           importance=TRUE) -> rf_w
print(rf_w)

Call:
 randomForest(formula = `Dentist Check-up` ~ ., data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 32.81%
Confusion matrix:
      0 Yes class.error
0   354 309   0.4660633
Yes 217 723   0.2308511
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Dentist Check-up`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   593  45
       Yes  70 895
                                          
               Accuracy : 0.9283          
                 95% CI : (0.9145, 0.9404)
    No Information Rate : 0.5864          
    P-Value [Acc > NIR] : < 2e-16         
                                          
                  Kappa : 0.8513          
                                          
 Mcnemar's Test P-Value : 0.02522         
                                          
            Sensitivity : 0.9521          
            Specificity : 0.8944          
         Pos Pred Value : 0.9275          
         Neg Pred Value : 0.9295          
             Prevalence : 0.5864          
         Detection Rate : 0.5583          
   Detection Prevalence : 0.6020          
      Balanced Accuracy : 0.9233          
                                          
       'Positive' Class : Yes             
                                          
pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Dentist Check-up`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   140  93
       Yes 154 315
                                          
               Accuracy : 0.6481          
                 95% CI : (0.6115, 0.6835)
    No Information Rate : 0.5812          
    P-Value [Acc > NIR] : 0.0001672       
                                          
                  Kappa : 0.2557          
                                          
 Mcnemar's Test P-Value : 0.0001347       
                                          
            Sensitivity : 0.7721          
            Specificity : 0.4762          
         Pos Pred Value : 0.6716          
         Neg Pred Value : 0.6009          
             Prevalence : 0.5812          
         Detection Rate : 0.4487          
   Detection Prevalence : 0.6681          
      Balanced Accuracy : 0.6241          
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Dentist Check-up`,pred_eth[,2])
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))

AUROC

pROC::auc(rocobj)
Area under the curve: 0.6809
pROC::auc(rocobj_wo)
Area under the curve: 0.6656

Variable Importance

randomForest::varImpPlot(rf_w)

randomForest::importance(rf_w)
                     0       Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity     2.637328 19.611727            23.513297         65.33439
Age          10.744222 22.345648            27.273211        150.85344
Gender        9.136014  2.322054             8.988817         25.43963
Religion     -1.532119 23.866492            23.414587         73.33316
Employment    6.797482  8.650243            12.627362         22.84096
Income       17.154001 27.930689            33.181691        118.57301
EnglishSpeak 11.087684 20.980028            26.048237         57.89318
EnglishDiff   8.755849  7.419699            12.420752         56.88743

Accuracy slightly decreased when ethnicity is added to the model, but AUC increased.

Logistic Regression

No ethnicity

Training Set

mod1 <- glm(`Dentist Check-up`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial) 

predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,train$`Dentist Check-up`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   363 179
       Yes 300 761
                                          
               Accuracy : 0.7012          
                 95% CI : (0.6781, 0.7235)
    No Information Rate : 0.5864          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.367           
                                          
 Mcnemar's Test P-Value : 4.183e-08       
                                          
            Sensitivity : 0.8096          
            Specificity : 0.5475          
         Pos Pred Value : 0.7172          
         Neg Pred Value : 0.6697          
             Prevalence : 0.5864          
         Detection Rate : 0.4747          
   Detection Prevalence : 0.6619          
      Balanced Accuracy : 0.6785          
                                          
       'Positive' Class : Yes             
                                          

Test Set

predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,test$`Dentist Check-up`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   141  93
       Yes 153 315
                                         
               Accuracy : 0.6496         
                 95% CI : (0.613, 0.6849)
    No Information Rate : 0.5812         
    P-Value [Acc > NIR] : 0.0001236      
                                         
                  Kappa : 0.259          
                                         
 Mcnemar's Test P-Value : 0.0001688      
                                         
            Sensitivity : 0.7721         
            Specificity : 0.4796         
         Pos Pred Value : 0.6731         
         Neg Pred Value : 0.6026         
             Prevalence : 0.5812         
         Detection Rate : 0.4487         
   Detection Prevalence : 0.6667         
      Balanced Accuracy : 0.6258         
                                         
       'Positive' Class : Yes            
                                         

ROC

rocobj_wo <-pROC::roc(test$`Dentist Check-up`,predicted_probs)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

pROC::auc(rocobj_wo)
Area under the curve: 0.6846

With ethnicity

Training Set

mod1 <- glm(`Dentist Check-up`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial) 

predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,train$`Dentist Check-up`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   370 191
       Yes 293 749
                                          
               Accuracy : 0.6981          
                 95% CI : (0.6749, 0.7205)
    No Information Rate : 0.5864          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.3631          
                                          
 Mcnemar's Test P-Value : 4.413e-06       
                                          
            Sensitivity : 0.7968          
            Specificity : 0.5581          
         Pos Pred Value : 0.7188          
         Neg Pred Value : 0.6595          
             Prevalence : 0.5864          
         Detection Rate : 0.4672          
   Detection Prevalence : 0.6500          
      Balanced Accuracy : 0.6774          
                                          
       'Positive' Class : Yes             
                                          

Test Set

predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,test$`Dentist Check-up`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   146  94
       Yes 148 314
                                          
               Accuracy : 0.6553          
                 95% CI : (0.6188, 0.6904)
    No Information Rate : 0.5812          
    P-Value [Acc > NIR] : 3.479e-05       
                                          
                  Kappa : 0.2732          
                                          
 Mcnemar's Test P-Value : 0.0006569       
                                          
            Sensitivity : 0.7696          
            Specificity : 0.4966          
         Pos Pred Value : 0.6797          
         Neg Pred Value : 0.6083          
             Prevalence : 0.5812          
         Detection Rate : 0.4473          
   Detection Prevalence : 0.6581          
      Balanced Accuracy : 0.6331          
                                          
       'Positive' Class : Yes             
                                          

ROC

rocobj_w <-pROC::roc(test$`Dentist Check-up`,predicted_probs)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)

pROC::auc(rocobj_w)
Area under the curve: 0.6887

Logistic Regression (ANOVA)

glm(`Dentist Check-up`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Dentist Check-up`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1w

ANOVA analysis

summary(mod1w)

Call:
glm(formula = `Dentist Check-up` ~ Age + Ethnicity + Gender + 
    Religion + Income + Employment + EnglishSpeak + EnglishDiff, 
    family = binomial, data = rfdata)

Coefficients:
                              Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -1.250239   0.353450  -3.537 0.000404 ***
Age                           0.022191   0.003169   7.003 2.50e-12 ***
EthnicityAsian Indian        -0.751432   0.279241  -2.691 0.007124 ** 
EthnicityFilipino            -0.295595   0.220973  -1.338 0.180995    
EthnicityKorean              -0.703159   0.160825  -4.372 1.23e-05 ***
EthnicityOther               -0.425536   0.238447  -1.785 0.074324 .  
EthnicityVietnamese          -0.254040   0.171791  -1.479 0.139202    
GenderMale                   -0.571692   0.098374  -5.811 6.19e-09 ***
ReligionCatholic             -0.086887   0.183688  -0.473 0.636202    
ReligionHindu                -0.843103   0.298541  -2.824 0.004742 ** 
ReligionMuslim               -0.481085   0.361333  -1.331 0.183051    
ReligionNone                 -0.226916   0.185576  -1.223 0.221419    
ReligionOther                -0.728813   0.372626  -1.956 0.050479 .  
ReligionProtestant           -0.310774   0.185933  -1.671 0.094636 .  
Income$10,000 - $19,999      -0.523858   0.211866  -2.473 0.013414 *  
Income$20,000 - $29,999      -0.430694   0.212835  -2.024 0.043011 *  
Income$30,000 - $39,999       0.056685   0.208564   0.272 0.785786    
Income$40,000 - $49,999      -0.149626   0.221115  -0.677 0.498603    
Income$50,000 - $59,999       0.503876   0.222652   2.263 0.023632 *  
Income$60,000 - $69,999       0.089518   0.216328   0.414 0.679015    
Income$70,000 and over        0.969868   0.170368   5.693 1.25e-08 ***
EmploymentEmployed full time  0.288165   0.104906   2.747 0.006017 ** 
EnglishSpeakNot well          1.012791   0.241469   4.194 2.74e-05 ***
EnglishSpeakVery well         1.796948   0.259731   6.919 4.56e-12 ***
EnglishSpeakWell              1.268193   0.243930   5.199 2.00e-07 ***
EnglishDiffMuch              -0.052360   0.155977  -0.336 0.737106    
EnglishDiffNot much          -0.151913   0.145964  -1.041 0.297986    
EnglishDiffVery much         -0.394036   0.140966  -2.795 0.005186 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3128.8  on 2304  degrees of freedom
Residual deviance: 2725.0  on 2277  degrees of freedom
AIC: 2781

Number of Fisher Scoring iterations: 4
car::Anova(mod1w)
Analysis of Deviance Table (Type II tests)

Response: Dentist Check-up
             LR Chisq Df Pr(>Chisq)    
Age            50.555  1  1.159e-12 ***
Ethnicity      23.080  5  0.0003259 ***
Gender         34.266  1  4.807e-09 ***
Religion       11.443  6  0.0756010 .  
Income        124.620  7  < 2.2e-16 ***
Employment      7.562  1  0.0059627 ** 
EnglishSpeak   54.819  3  7.505e-12 ***
EnglishDiff     8.846  3  0.0314106 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w)
                  GVIF Df GVIF^(1/(2*Df))
Age           1.279353  1        1.131085
Ethnicity    14.729289  5        1.308634
Gender        1.130464  1        1.063233
Religion     12.809561  6        1.236786
Income        1.500217  7        1.029396
Employment    1.276896  1        1.129998
EnglishSpeak  2.483570  3        1.163714
EnglishDiff   1.811780  3        1.104123
anova(mod1,mod1w,test="LRT")
Analysis of Deviance Table

Model 1: `Dentist Check-up` ~ Age + Gender + Religion + Income + Employment + 
    EnglishSpeak + EnglishDiff
Model 2: `Dentist Check-up` ~ Age + Ethnicity + Gender + Religion + Income + 
    Employment + EnglishSpeak + EnglishDiff
  Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
1      2282     2748.0                          
2      2277     2724.9  5    23.08 0.0003259 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

AIC/BIC values

data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w))
    AIC_wo    AIC_w
1 2794.034 2780.954
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w))
    BIC_wo    BIC_w
1 2926.119 2941.754

Folk Medicine

ps(`Folkmedicine`)
# A tibble: 3 × 3
  Folkmedicine     n   pct
  <fct>        <int> <dbl>
1 0             2189 83.9 
2 Yes            348 13.3 
3 <NA>            72  2.76

Random Forest

Without Ethnicity

Training Set

rfdata <- qol |> 
  select(`Folkmedicine`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`)

pos<- rfdata |> filter(`Folkmedicine`=="Yes")
neg <- rfdata |> filter(`Folkmedicine`==0)

set.seed(222)
ind_pos <- sample(2, nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(2, nrow(neg), replace = TRUE, prob = c(0.7, 0.3))


train <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
test <- bind_rows(pos[ind_pos==2,],neg[ind_neg==2,])

randomForest::randomForest(`Folkmedicine`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff ,
                           data=train,
                           importance=TRUE) -> rf_wo
print(rf_wo)

Call:
 randomForest(formula = Folkmedicine ~ Age + Gender + Religion +      Income + Employment + EnglishSpeak + EnglishDiff, data = train,      importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 14.05%
Confusion matrix:
       0 Yes class.error
0   1361  11 0.008017493
Yes  212   3 0.986046512
pred_noeth <- predict(rf_wo,train)
caret::confusionMatrix(pred_noeth,train$`Folkmedicine`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0   1372  128
       Yes    0   87
                                          
               Accuracy : 0.9193          
                 95% CI : (0.9048, 0.9323)
    No Information Rate : 0.8645          
    P-Value [Acc > NIR] : 6.136e-12       
                                          
                  Kappa : 0.5403          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.40465         
            Specificity : 1.00000         
         Pos Pred Value : 1.00000         
         Neg Pred Value : 0.91467         
             Prevalence : 0.13548         
         Detection Rate : 0.05482         
   Detection Prevalence : 0.05482         
      Balanced Accuracy : 0.70233         
                                          
       'Positive' Class : Yes             
                                          

Test set

pred_noeth <- predict(rf_wo,test)
caret::confusionMatrix(pred_noeth,test$`Folkmedicine`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   598  96
       Yes   2   1
                                          
               Accuracy : 0.8594          
                 95% CI : (0.8314, 0.8844)
    No Information Rate : 0.8608          
    P-Value [Acc > NIR] : 0.5702          
                                          
                  Kappa : 0.0117          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.010309        
            Specificity : 0.996667        
         Pos Pred Value : 0.333333        
         Neg Pred Value : 0.861671        
             Prevalence : 0.139168        
         Detection Rate : 0.001435        
   Detection Prevalence : 0.004304        
      Balanced Accuracy : 0.503488        
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

pred_noeth <- predict(rf_wo,test,type="prob")
rocobj_wo <-pROC::roc(test$`Folkmedicine`,pred_noeth[,2])
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

With Ethnicity

Training Set

# rfdata <- qol |>
#   select(`Folkmedicine`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income, `English Speaking`, `English Difficulties`) %>%
#   na.omit() |> 
#   rename(Employment=`Full Time Employment`,
#          EnglishSpeak=`English Speaking`,
#          EnglishDiff=`English Difficulties`)
# 
# set.seed(222)
# ind <- sample(2, nrow(rfdata), replace = TRUE, prob = c(0.7, 0.3))
# 
# train <- rfdata[ind==1,]
# test <- rfdata[ind==2,]

randomForest::randomForest(`Folkmedicine`~. ,data=train,
                           importance=TRUE) -> rf_w
print(rf_w)

Call:
 randomForest(formula = Folkmedicine ~ ., data = train, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 14.11%
Confusion matrix:
       0 Yes class.error
0   1360  12 0.008746356
Yes  212   3 0.986046512
pred_eth <- predict(rf_w,train)
caret::confusionMatrix(pred_eth,train$`Folkmedicine`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0   1372   93
       Yes    0  122
                                          
               Accuracy : 0.9414          
                 95% CI : (0.9287, 0.9524)
    No Information Rate : 0.8645          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.694           
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.56744         
            Specificity : 1.00000         
         Pos Pred Value : 1.00000         
         Neg Pred Value : 0.93652         
             Prevalence : 0.13548         
         Detection Rate : 0.07687         
   Detection Prevalence : 0.07687         
      Balanced Accuracy : 0.78372         
                                          
       'Positive' Class : Yes             
                                          

Test Set

pred_eth <- predict(rf_w,test)
caret::confusionMatrix(pred_eth,test$`Folkmedicine`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   595  96
       Yes   5   1
                                          
               Accuracy : 0.8551          
                 95% CI : (0.8267, 0.8804)
    No Information Rate : 0.8608          
    P-Value [Acc > NIR] : 0.6923          
                                          
                  Kappa : 0.0033          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.010309        
            Specificity : 0.991667        
         Pos Pred Value : 0.166667        
         Neg Pred Value : 0.861071        
             Prevalence : 0.139168        
         Detection Rate : 0.001435        
   Detection Prevalence : 0.008608        
      Balanced Accuracy : 0.500988        
                                          
       'Positive' Class : Yes             
                                          

ROC Curve

pred_eth <- predict(rf_w,test,type="prob")
rocobj <-pROC::roc(test$`Folkmedicine`,pred_eth[,2])
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(list(NoEthnicity=rocobj_wo,Ethnicity=rocobj))

AUROC

pROC::auc(rocobj)
Area under the curve: 0.6394
pROC::auc(rocobj_wo)
Area under the curve: 0.6139

Variable Importance

randomForest::varImpPlot(rf_w)

randomForest::importance(rf_w)
                     0        Yes MeanDecreaseAccuracy MeanDecreaseGini
Ethnicity    12.726453  6.3822426            15.118640         32.25924
Age          12.115027 10.9895607            15.384077         81.72420
Gender        2.006013  3.8682783             3.404765         12.07102
Religion      3.071722  0.8760851             3.184835         30.79800
Employment   10.548735  1.1738049            10.962190         11.33764
Income        7.361344 -2.8518109             5.614595         46.51250
EnglishSpeak  9.355979 -1.1478322             8.759329         23.88430
EnglishDiff   4.538927  0.6254122             4.520140         26.85372

Accuracy slightly decreased when ethnicity is added to the model.

Logistic Regression

No ethnicity

Training Set

mod1 <- glm(`Folkmedicine`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial) 

predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,train$`Folkmedicine`,positive="Yes")
Warning in confusionMatrix.default(pred_noeth, train$Folkmedicine, positive =
"Yes"): Levels are not in the same order for reference and data. Refactoring
data to match.
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0   1372  215
       Yes    0    0
                                         
               Accuracy : 0.8645         
                 95% CI : (0.8467, 0.881)
    No Information Rate : 0.8645         
    P-Value [Acc > NIR] : 0.5182         
                                         
                  Kappa : 0              
                                         
 Mcnemar's Test P-Value : <2e-16         
                                         
            Sensitivity : 0.0000         
            Specificity : 1.0000         
         Pos Pred Value :    NaN         
         Neg Pred Value : 0.8645         
             Prevalence : 0.1355         
         Detection Rate : 0.0000         
   Detection Prevalence : 0.0000         
      Balanced Accuracy : 0.5000         
                                         
       'Positive' Class : Yes            
                                         

Test Set

predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,test$`Folkmedicine`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   598  96
       Yes   2   1
                                          
               Accuracy : 0.8594          
                 95% CI : (0.8314, 0.8844)
    No Information Rate : 0.8608          
    P-Value [Acc > NIR] : 0.5702          
                                          
                  Kappa : 0.0117          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.010309        
            Specificity : 0.996667        
         Pos Pred Value : 0.333333        
         Neg Pred Value : 0.861671        
             Prevalence : 0.139168        
         Detection Rate : 0.001435        
   Detection Prevalence : 0.004304        
      Balanced Accuracy : 0.503488        
                                          
       'Positive' Class : Yes             
                                          

ROC

rocobj_wo <-pROC::roc(test$`Folkmedicine`,predicted_probs)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_wo)

pROC::auc(rocobj_wo)
Area under the curve: 0.6245

With ethnicity

Training Set

mod1 <- glm(`Folkmedicine`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=train,family=binomial) 

predict(mod1,train,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,train$`Folkmedicine`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction    0  Yes
       0   1369  212
       Yes    3    3
                                         
               Accuracy : 0.8645         
                 95% CI : (0.8467, 0.881)
    No Information Rate : 0.8645         
    P-Value [Acc > NIR] : 0.5182         
                                         
                  Kappa : 0.0199         
                                         
 Mcnemar's Test P-Value : <2e-16         
                                         
            Sensitivity : 0.013953       
            Specificity : 0.997813       
         Pos Pred Value : 0.500000       
         Neg Pred Value : 0.865908       
             Prevalence : 0.135476       
         Detection Rate : 0.001890       
   Detection Prevalence : 0.003781       
      Balanced Accuracy : 0.505883       
                                         
       'Positive' Class : Yes            
                                         

Test Set

predict(mod1,test,type="response") -> predicted_probs
pred_noeth<- ifelse(predicted_probs > 0.5, "Yes", "0") |> as.factor()

caret::confusionMatrix(pred_noeth,test$`Folkmedicine`,positive="Yes")
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   596  97
       Yes   4   0
                                          
               Accuracy : 0.8551          
                 95% CI : (0.8267, 0.8804)
    No Information Rate : 0.8608          
    P-Value [Acc > NIR] : 0.6923          
                                          
                  Kappa : -0.0111         
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.000000        
            Specificity : 0.993333        
         Pos Pred Value : 0.000000        
         Neg Pred Value : 0.860029        
             Prevalence : 0.139168        
         Detection Rate : 0.000000        
   Detection Prevalence : 0.005739        
      Balanced Accuracy : 0.496667        
                                          
       'Positive' Class : Yes             
                                          

ROC

rocobj_w <-pROC::roc(test$`Folkmedicine`,predicted_probs)
Setting levels: control = 0, case = Yes
Setting direction: controls < cases
pROC::ggroc(rocobj_w)

AUROC

pROC::auc(rocobj_w)
Area under the curve: 0.6418
pROC::auc(rocobj_wo)
Area under the curve: 0.6245

Logistic Regression (ANOVA)

glm(`Folkmedicine`~Age+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) -> mod1
glm(`Folkmedicine`~Age+Ethnicity+Gender+Religion + Income +Employment+EnglishSpeak+EnglishDiff,data=rfdata,family=binomial) ->mod1w

ANOVA analysis

summary(mod1w)

Call:
glm(formula = Folkmedicine ~ Age + Ethnicity + Gender + Religion + 
    Income + Employment + EnglishSpeak + EnglishDiff, family = binomial, 
    data = rfdata)

Coefficients:
                             Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -2.43556    0.46953  -5.187 2.13e-07 ***
Age                           0.02415    0.00416   5.805 6.43e-09 ***
EthnicityAsian Indian        -0.11957    0.42985  -0.278  0.78088    
EthnicityFilipino            -0.58713    0.30650  -1.916  0.05541 .  
EthnicityKorean               0.20188    0.18616   1.084  0.27816    
EthnicityOther               -0.43428    0.33056  -1.314  0.18893    
EthnicityVietnamese          -1.06441    0.24323  -4.376 1.21e-05 ***
GenderMale                   -0.24896    0.13467  -1.849  0.06449 .  
ReligionCatholic             -0.30153    0.26097  -1.155  0.24792    
ReligionHindu                -0.70599    0.46964  -1.503  0.13278    
ReligionMuslim               -2.75096    1.07403  -2.561  0.01043 *  
ReligionNone                 -0.25484    0.23767  -1.072  0.28361    
ReligionOther                 0.20838    0.50280   0.414  0.67855    
ReligionProtestant           -0.17524    0.23207  -0.755  0.45018    
Income$10,000 - $19,999       0.44302    0.31741   1.396  0.16280    
Income$20,000 - $29,999       0.52783    0.33533   1.574  0.11548    
Income$30,000 - $39,999       0.94768    0.31168   3.041  0.00236 ** 
Income$40,000 - $49,999       0.46466    0.34923   1.331  0.18334    
Income$50,000 - $59,999       0.53736    0.33581   1.600  0.10955    
Income$60,000 - $69,999       0.70177    0.33210   2.113  0.03459 *  
Income$70,000 and over        0.61176    0.27203   2.249  0.02452 *  
EmploymentEmployed full time -0.20032    0.14311  -1.400  0.16159    
EnglishSpeakNot well         -0.16989    0.27463  -0.619  0.53617    
EnglishSpeakVery well        -0.50869    0.30937  -1.644  0.10012    
EnglishSpeakWell             -0.13168    0.27963  -0.471  0.63770    
EnglishDiffMuch              -0.05387    0.21241  -0.254  0.79978    
EnglishDiffNot much          -0.12709    0.19831  -0.641  0.52162    
EnglishDiffVery much         -0.13559    0.20930  -0.648  0.51711    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1821.5  on 2283  degrees of freedom
Residual deviance: 1666.8  on 2256  degrees of freedom
AIC: 1722.8

Number of Fisher Scoring iterations: 6
car::Anova(mod1w)
Analysis of Deviance Table (Type II tests)

Response: Folkmedicine
             LR Chisq Df Pr(>Chisq)    
Age            33.881  1  5.860e-09 ***
Ethnicity      30.371  5  1.247e-05 ***
Gender          3.446  1    0.06340 .  
Religion       14.206  6    0.02742 *  
Income         10.679  7    0.15327    
Employment      1.967  1    0.16079    
EnglishSpeak    4.608  3    0.20289    
EnglishDiff     0.670  3    0.88013    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod1w)
                  GVIF Df GVIF^(1/(2*Df))
Age           1.209700  1        1.099863
Ethnicity    13.559109  5        1.297846
Gender        1.091570  1        1.044782
Religion     11.613667  6        1.226726
Income        1.405444  7        1.024609
Employment    1.218399  1        1.103811
EnglishSpeak  2.479916  3        1.163428
EnglishDiff   1.825507  3        1.105513
anova(mod1,mod1w,test="LRT")
Analysis of Deviance Table

Model 1: Folkmedicine ~ Age + Gender + Religion + Income + Employment + 
    EnglishSpeak + EnglishDiff
Model 2: Folkmedicine ~ Age + Ethnicity + Gender + Religion + Income + 
    Employment + EnglishSpeak + EnglishDiff
  Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
1      2261     1697.2                          
2      2256     1666.8  5   30.371 1.247e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

AIC/BIC values

data.frame(AIC_wo = AIC(mod1), AIC_w=AIC(mod1w))
    AIC_wo    AIC_w
1 1743.189 1722.818
data.frame(BIC_wo = BIC(mod1), BIC_w=BIC(mod1w))
    BIC_wo    BIC_w
1 1875.064 1883.361